From 6aac3fbba59b177803e8423d7fd9f186fc19d47c Mon Sep 17 00:00:00 2001
From: Christian Buchner
Date: Tue, 18 Mar 2014 22:17:40 +0100
Subject: [PATCH] commit initial version 0.1

---
 AUTHORS | 1 + COPYING | 1 + ChangeLog | 1 + INSTALL | 9 + LICENSE | 677 +- LICENSE.txt | 885 ++ Makefile.am | 45 + Makefile.in | 1107 +++ NEWS | 1 + README | 3 + README.txt | 117 + aclocal.m4 | 1379 +++ autogen.sh | 1 + blake.c | 1120 +++ ccminer.sln | 26 + ccminer.vcxproj | 291 + ccminer.vcxproj.filters | 188 + compat.h | 24 + compat/Makefile.am | 7 + compat/Makefile.in | 601 ++ compat/getopt/getopt.h | 93 + compat/getopt/getopt_long.c | 554 ++ compat/gettimeofday.c | 83 + compat/inttypes.h | 2 + compat/jansson/Makefile.am | 18 + compat/jansson/Makefile.in | 571 ++ compat/jansson/config.h | 73 + compat/jansson/dump.c | 460 + compat/jansson/hashtable.c | 379 + compat/jansson/hashtable.h | 207 + compat/jansson/jansson.h | 200 + compat/jansson/jansson_private.h | 60 + compat/jansson/load.c | 879 ++ compat/jansson/strbuffer.c | 95 + compat/jansson/strbuffer.h | 31 + compat/jansson/utf.c | 190 + compat/jansson/utf.h | 28 + compat/jansson/util.h | 15 + compat/jansson/value.c | 976 ++ compat/stdbool.h | 6 + compat/sys/time.h | 11 + compat/thrust/CHANGELOG | 662 ++ compat/thrust/adjacent_difference.h | 244 + compat/thrust/advance.h | 73 + compat/thrust/binary_search.h | 1888 ++++ compat/thrust/copy.h | 505 + compat/thrust/count.h | 231 + compat/thrust/detail/adjacent_difference.inl | 88 + compat/thrust/detail/advance.inl | 38 + .../detail/allocator/allocator_traits.h | 240 + .../detail/allocator/allocator_traits.inl | 287 + .../detail/allocator/copy_construct_range.h | 45 + .../detail/allocator/copy_construct_range.inl | 298 + .../allocator/default_construct_range.h | 36 + .../allocator/default_construct_range.inl | 105 + .../thrust/detail/allocator/destroy_range.h | 33 + .../thrust/detail/allocator/destroy_range.inl | 158 + .../detail/allocator/fill_construct_range.h | 35 + .../detail/allocator/fill_construct_range.inl | 109 + .../detail/allocator/malloc_allocator.h | 52 + .../detail/allocator/malloc_allocator.inl | 64 + .../detail/allocator/no_throw_allocator.h | 62 + .../detail/allocator/tagged_allocator.h | 101 + .../detail/allocator/tagged_allocator.inl | 97 + .../detail/allocator/temporary_allocator.h | 75 + .../detail/allocator/temporary_allocator.inl | 59 + compat/thrust/detail/binary_search.inl | 458 + compat/thrust/detail/config.h | 23 + compat/thrust/detail/config/compiler.h | 103 + compat/thrust/detail/config/compiler_fence.h | 52 + compat/thrust/detail/config/config.h | 36 + compat/thrust/detail/config/debug.h | 32 + compat/thrust/detail/config/device_system.h | 61 + compat/thrust/detail/config/forceinline.h | 36 + .../thrust/detail/config/hd_warning_disable.h | 35 + compat/thrust/detail/config/host_device.h | 44 + compat/thrust/detail/config/host_system.h | 58 + compat/thrust/detail/config/simple_defines.h | 28 + compat/thrust/detail/contiguous_storage.h | 129 + compat/thrust/detail/contiguous_storage.inl | 245 + compat/thrust/detail/copy.h | 87 + compat/thrust/detail/copy.inl | 124 + compat/thrust/detail/copy_if.h | 68 + compat/thrust/detail/copy_if.inl | 105 + compat/thrust/detail/count.inl | 80 + compat/thrust/detail/cstdint.h | 79 + compat/thrust/detail/device_delete.inl | 47 + compat/thrust/detail/device_free.inl | 44 + compat/thrust/detail/device_malloc.inl | 60 + compat/thrust/detail/device_new.inl | 60 + compat/thrust/detail/device_ptr.inl | 74 + compat/thrust/detail/device_reference.inl | 53 + 
compat/thrust/detail/device_vector.inl | 37 + .../thrust/detail/dispatch/is_trivial_copy.h | 59 + compat/thrust/detail/distance.inl | 39 + compat/thrust/detail/equal.inl | 82 + compat/thrust/detail/execute_with_allocator.h | 84 + compat/thrust/detail/execution_policy.h | 78 + compat/thrust/detail/extrema.inl | 160 + compat/thrust/detail/fill.inl | 85 + compat/thrust/detail/find.inl | 109 + compat/thrust/detail/for_each.inl | 90 + compat/thrust/detail/function.h | 226 + compat/thrust/detail/functional.inl | 122 + compat/thrust/detail/functional/actor.h | 192 + compat/thrust/detail/functional/actor.inl | 194 + compat/thrust/detail/functional/argument.h | 75 + compat/thrust/detail/functional/composite.h | 163 + compat/thrust/detail/functional/operators.h | 25 + .../operators/arithmetic_operators.h | 394 + .../operators/assignment_operator.h | 72 + .../functional/operators/bitwise_operators.h | 313 + .../operators/compound_assignment_operators.h | 424 + .../functional/operators/logical_operators.h | 144 + .../functional/operators/operator_adaptors.h | 115 + .../operators/relational_operators.h | 323 + compat/thrust/detail/functional/placeholder.h | 39 + compat/thrust/detail/functional/value.h | 80 + compat/thrust/detail/gather.inl | 160 + compat/thrust/detail/generate.inl | 94 + compat/thrust/detail/host_vector.inl | 37 + compat/thrust/detail/inner_product.inl | 104 + compat/thrust/detail/integer_traits.h | 132 + compat/thrust/detail/internal_functional.h | 678 ++ compat/thrust/detail/logical.inl | 96 + compat/thrust/detail/malloc_and_free.h | 79 + compat/thrust/detail/merge.inl | 217 + compat/thrust/detail/minmax.h | 55 + compat/thrust/detail/mismatch.inl | 93 + compat/thrust/detail/mpl/math.h | 174 + compat/thrust/detail/numeric_traits.h | 130 + compat/thrust/detail/overlapped_copy.h | 131 + compat/thrust/detail/pair.inl | 225 + compat/thrust/detail/partition.inl | 398 + compat/thrust/detail/pointer.h | 184 + compat/thrust/detail/pointer.inl | 143 + compat/thrust/detail/range/tail_flags.h | 124 + compat/thrust/detail/raw_pointer_cast.h | 33 + compat/thrust/detail/raw_reference_cast.h | 121 + compat/thrust/detail/raw_reference_cast.inl | 277 + compat/thrust/detail/reduce.inl | 261 + compat/thrust/detail/reference.h | 167 + compat/thrust/detail/reference.inl | 361 + .../detail/reference_forward_declaration.h | 28 + compat/thrust/detail/remove.inl | 238 + compat/thrust/detail/replace.inl | 210 + compat/thrust/detail/reverse.inl | 87 + compat/thrust/detail/scan.inl | 502 + compat/thrust/detail/scatter.inl | 159 + compat/thrust/detail/sequence.inl | 112 + compat/thrust/detail/set_operations.inl | 836 ++ compat/thrust/detail/sort.inl | 383 + compat/thrust/detail/static_assert.h | 71 + compat/thrust/detail/swap.h | 35 + compat/thrust/detail/swap.inl | 21 + compat/thrust/detail/swap_ranges.inl | 64 + compat/thrust/detail/tabulate.inl | 55 + compat/thrust/detail/temporary_array.h | 158 + compat/thrust/detail/temporary_array.inl | 148 + compat/thrust/detail/temporary_buffer.h | 71 + compat/thrust/detail/transform.inl | 239 + compat/thrust/detail/transform_reduce.inl | 70 + compat/thrust/detail/transform_scan.inl | 115 + compat/thrust/detail/trivial_sequence.h | 87 + compat/thrust/detail/tuple.inl | 948 ++ compat/thrust/detail/tuple_meta_transform.h | 177 + compat/thrust/detail/tuple_transform.h | 418 + compat/thrust/detail/type_traits.h | 641 ++ ...mediate_type_from_function_and_iterators.h | 61 + .../detail/type_traits/function_traits.h | 96 + .../detail/type_traits/has_member_function.h | 118 + 
.../detail/type_traits/has_nested_type.h | 32 + .../detail/type_traits/has_trivial_assign.h | 52 + .../detail/type_traits/is_call_possible.h | 161 + .../type_traits/is_metafunction_defined.h | 41 + .../iterator/is_discard_iterator.h | 40 + .../type_traits/iterator/is_output_iterator.h | 66 + .../thrust/detail/type_traits/minimum_type.h | 162 + .../detail/type_traits/pointer_traits.h | 276 + compat/thrust/detail/type_traits/result_of.h | 52 + compat/thrust/detail/uninitialized_copy.inl | 93 + compat/thrust/detail/uninitialized_fill.inl | 88 + compat/thrust/detail/unique.inl | 320 + compat/thrust/detail/use_default.h | 27 + compat/thrust/detail/util/align.h | 52 + compat/thrust/detail/util/blocking.h | 58 + compat/thrust/detail/vector_base.h | 534 ++ compat/thrust/detail/vector_base.inl | 1203 +++ compat/thrust/device_allocator.h | 123 + compat/thrust/device_delete.h | 56 + compat/thrust/device_free.h | 68 + compat/thrust/device_malloc.h | 103 + compat/thrust/device_malloc_allocator.h | 174 + compat/thrust/device_new.h | 88 + compat/thrust/device_new_allocator.h | 172 + compat/thrust/device_ptr.h | 170 + compat/thrust/device_reference.h | 969 ++ compat/thrust/device_vector.h | 418 + compat/thrust/distance.h | 76 + compat/thrust/equal.h | 236 + compat/thrust/execution_policy.h | 351 + compat/thrust/extrema.h | 798 ++ compat/thrust/fill.h | 205 + compat/thrust/find.h | 382 + compat/thrust/for_each.h | 278 + compat/thrust/functional.h | 1079 +++ compat/thrust/gather.h | 438 + compat/thrust/generate.h | 211 + compat/thrust/host_vector.h | 424 + compat/thrust/inner_product.h | 262 + compat/thrust/iterator/constant_iterator.h | 251 + compat/thrust/iterator/counting_iterator.h | 243 + compat/thrust/iterator/detail/any_assign.h | 55 + .../thrust/iterator/detail/any_system_tag.h | 37 + .../iterator/detail/constant_iterator_base.h | 70 + .../iterator/detail/counting_iterator.inl | 141 + .../iterator/detail/device_system_tag.h | 40 + .../iterator/detail/discard_iterator_base.h | 65 + .../iterator/detail/distance_from_result.h | 42 + .../thrust/iterator/detail/host_system_tag.h | 40 + .../iterator/detail/is_iterator_category.h | 60 + .../iterator/detail/is_trivial_iterator.h | 96 + .../iterator/detail/iterator_adaptor_base.h | 111 + .../detail/iterator_category_to_system.h | 95 + .../detail/iterator_category_to_traversal.h | 178 + .../detail/iterator_facade_category.h | 283 + .../iterator/detail/iterator_traits.inl | 112 + .../iterator/detail/iterator_traversal_tags.h | 41 + .../thrust/iterator/detail/minimum_category.h | 52 + .../thrust/iterator/detail/minimum_system.h | 49 + .../thrust/iterator/detail/normal_iterator.h | 76 + .../detail/permutation_iterator_base.h | 53 + compat/thrust/iterator/detail/retag.h | 140 + .../iterator/detail/reverse_iterator.inl | 108 + .../iterator/detail/reverse_iterator_base.h | 42 + .../thrust/iterator/detail/tagged_iterator.h | 74 + .../iterator/detail/transform_iterator.inl | 72 + .../detail/tuple_of_iterator_references.h | 246 + .../iterator/detail/universal_categories.h | 85 + .../thrust/iterator/detail/zip_iterator.inl | 151 + .../iterator/detail/zip_iterator_base.h | 418 + compat/thrust/iterator/discard_iterator.h | 171 + compat/thrust/iterator/iterator_adaptor.h | 239 + compat/thrust/iterator/iterator_categories.h | 191 + compat/thrust/iterator/iterator_facade.h | 538 ++ compat/thrust/iterator/iterator_traits.h | 76 + compat/thrust/iterator/permutation_iterator.h | 210 + compat/thrust/iterator/retag.h | 68 + compat/thrust/iterator/reverse_iterator.h | 238 + 
compat/thrust/iterator/transform_iterator.h | 344 + compat/thrust/iterator/zip_iterator.h | 245 + compat/thrust/logical.h | 276 + compat/thrust/memory.h | 538 ++ compat/thrust/merge.h | 676 ++ compat/thrust/mismatch.h | 258 + compat/thrust/pair.h | 283 + compat/thrust/partition.h | 1429 +++ compat/thrust/random.h | 120 + .../random/detail/discard_block_engine.inl | 201 + .../detail/linear_congruential_engine.inl | 163 + .../linear_congruential_engine_discard.h | 107 + .../detail/linear_feedback_shift_engine.inl | 158 + .../linear_feedback_shift_engine_wordmask.h | 47 + compat/thrust/random/detail/mod.h | 97 + .../random/detail/normal_distribution.inl | 241 + .../random/detail/normal_distribution_base.h | 149 + .../thrust/random/detail/random_core_access.h | 57 + .../detail/subtract_with_carry_engine.inl | 203 + .../detail/uniform_int_distribution.inl | 232 + .../detail/uniform_real_distribution.inl | 217 + .../random/detail/xor_combine_engine.inl | 203 + .../random/detail/xor_combine_engine_max.h | 324 + compat/thrust/random/discard_block_engine.h | 252 + .../random/linear_congruential_engine.h | 295 + .../random/linear_feedback_shift_engine.h | 230 + compat/thrust/random/normal_distribution.h | 275 + .../random/subtract_with_carry_engine.h | 256 + .../thrust/random/uniform_int_distribution.h | 276 + .../thrust/random/uniform_real_distribution.h | 274 + compat/thrust/random/xor_combine_engine.h | 271 + compat/thrust/reduce.h | 779 ++ compat/thrust/remove.h | 800 ++ compat/thrust/replace.h | 817 ++ compat/thrust/reverse.h | 213 + compat/thrust/scan.h | 1552 ++++ compat/thrust/scatter.h | 420 + compat/thrust/sequence.h | 293 + compat/thrust/set_operations.h | 2947 ++++++ compat/thrust/sort.h | 1349 +++ compat/thrust/swap.h | 190 + .../system/cpp/detail/adjacent_difference.h | 54 + .../thrust/system/cpp/detail/assign_value.h | 42 + .../thrust/system/cpp/detail/binary_search.h | 77 + compat/thrust/system/cpp/detail/copy.h | 61 + compat/thrust/system/cpp/detail/copy_if.h | 51 + compat/thrust/system/cpp/detail/count.h | 22 + .../thrust/system/cpp/detail/dispatch/sort.h | 119 + compat/thrust/system/cpp/detail/equal.h | 22 + .../system/cpp/detail/execution_policy.h | 84 + compat/thrust/system/cpp/detail/extrema.h | 72 + compat/thrust/system/cpp/detail/fill.h | 22 + compat/thrust/system/cpp/detail/find.h | 52 + compat/thrust/system/cpp/detail/for_each.h | 60 + compat/thrust/system/cpp/detail/gather.h | 22 + compat/thrust/system/cpp/detail/generate.h | 22 + compat/thrust/system/cpp/detail/get_value.h | 45 + .../thrust/system/cpp/detail/inner_product.h | 22 + compat/thrust/system/cpp/detail/iter_swap.h | 46 + compat/thrust/system/cpp/detail/logical.h | 22 + .../system/cpp/detail/malloc_and_free.h | 54 + compat/thrust/system/cpp/detail/memory.inl | 92 + compat/thrust/system/cpp/detail/merge.h | 76 + compat/thrust/system/cpp/detail/mismatch.h | 22 + compat/thrust/system/cpp/detail/par.h | 66 + compat/thrust/system/cpp/detail/partition.h | 95 + compat/thrust/system/cpp/detail/reduce.h | 54 + .../thrust/system/cpp/detail/reduce_by_key.h | 57 + compat/thrust/system/cpp/detail/remove.h | 88 + compat/thrust/system/cpp/detail/replace.h | 22 + compat/thrust/system/cpp/detail/reverse.h | 22 + compat/thrust/system/cpp/detail/scan.h | 70 + compat/thrust/system/cpp/detail/scan_by_key.h | 71 + compat/thrust/system/cpp/detail/scatter.h | 22 + compat/thrust/system/cpp/detail/sequence.h | 22 + .../thrust/system/cpp/detail/set_operations.h | 105 + compat/thrust/system/cpp/detail/sort.h | 60 + 
compat/thrust/system/cpp/detail/swap_ranges.h | 22 + compat/thrust/system/cpp/detail/tabulate.h | 22 + .../system/cpp/detail/temporary_buffer.h | 22 + compat/thrust/system/cpp/detail/transform.h | 22 + .../system/cpp/detail/transform_reduce.h | 22 + .../thrust/system/cpp/detail/transform_scan.h | 22 + .../system/cpp/detail/uninitialized_copy.h | 22 + .../system/cpp/detail/uninitialized_fill.h | 22 + compat/thrust/system/cpp/detail/unique.h | 61 + .../thrust/system/cpp/detail/unique_by_key.h | 70 + compat/thrust/system/cpp/detail/vector.inl | 97 + compat/thrust/system/cpp/execution_policy.h | 157 + compat/thrust/system/cpp/memory.h | 414 + compat/thrust/system/cpp/vector.h | 149 + .../system/cuda/detail/adjacent_difference.h | 51 + .../cuda/detail/adjacent_difference.inl | 197 + .../thrust/system/cuda/detail/assign_value.h | 198 + .../thrust/system/cuda/detail/binary_search.h | 22 + compat/thrust/system/cuda/detail/block/copy.h | 223 + .../system/cuda/detail/block/exclusive_scan.h | 74 + .../system/cuda/detail/block/inclusive_scan.h | 191 + .../thrust/system/cuda/detail/block/merge.h | 74 + .../thrust/system/cuda/detail/block/merge.inl | 168 + .../system/cuda/detail/block/merging_sort.h | 199 + .../system/cuda/detail/block/odd_even_sort.h | 151 + .../thrust/system/cuda/detail/block/reduce.h | 67 + compat/thrust/system/cuda/detail/copy.h | 79 + compat/thrust/system/cuda/detail/copy.inl | 88 + .../system/cuda/detail/copy_cross_system.h | 59 + .../system/cuda/detail/copy_cross_system.inl | 301 + .../cuda/detail/copy_device_to_device.h | 51 + .../cuda/detail/copy_device_to_device.inl | 127 + compat/thrust/system/cuda/detail/copy_if.h | 49 + compat/thrust/system/cuda/detail/copy_if.inl | 212 + compat/thrust/system/cuda/detail/count.h | 22 + .../system/cuda/detail/cuda_launch_config.h | 384 + .../cuda/detail/default_decomposition.h | 45 + .../cuda/detail/default_decomposition.inl | 41 + .../system/cuda/detail/detail/alignment.h | 223 + .../cuda/detail/detail/b40c/kernel_utils.h | 284 + .../cuda/detail/detail/b40c/radixsort_api.h | 807 ++ .../detail/b40c/radixsort_kernel_common.h | 173 + .../detail/b40c/radixsort_key_conversion.h | 352 + .../detail/b40c/radixsort_reduction_kernel.h | 439 + .../b40c/radixsort_scanscatter_kernel.h | 1207 +++ .../detail/b40c/radixsort_spine_kernel.h | 187 + .../cuda/detail/detail/b40c/vector_types.h | 96 + .../system/cuda/detail/detail/balanced_path.h | 156 + .../detail/cached_temporary_allocator.h | 156 + .../system/cuda/detail/detail/fast_scan.h | 67 + .../system/cuda/detail/detail/fast_scan.inl | 753 ++ .../cuda/detail/detail/launch_calculator.h | 82 + .../cuda/detail/detail/launch_calculator.inl | 103 + .../cuda/detail/detail/launch_closure.h | 114 + .../cuda/detail/detail/launch_closure.inl | 207 + .../system/cuda/detail/detail/set_operation.h | 56 + .../cuda/detail/detail/set_operation.inl | 639 ++ .../cuda/detail/detail/stable_merge_sort.h | 63 + .../cuda/detail/detail/stable_merge_sort.inl | 1103 +++ .../detail/detail/stable_primitive_sort.h | 54 + .../detail/detail/stable_primitive_sort.inl | 159 + .../cuda/detail/detail/stable_radix_sort.h | 59 + .../cuda/detail/detail/stable_radix_sort.inl | 220 + .../cuda/detail/detail/stable_sort_by_count.h | 53 + .../detail/detail/stable_sort_by_count.inl | 179 + .../system/cuda/detail/detail/uninitialized.h | 261 + compat/thrust/system/cuda/detail/equal.h | 22 + compat/thrust/system/cuda/detail/error.inl | 95 + .../system/cuda/detail/execution_policy.h | 131 + .../system/cuda/detail/extern_shared_ptr.h | 58 + 
compat/thrust/system/cuda/detail/extrema.h | 22 + compat/thrust/system/cuda/detail/fill.h | 54 + compat/thrust/system/cuda/detail/fill.inl | 178 + compat/thrust/system/cuda/detail/find.h | 22 + compat/thrust/system/cuda/detail/for_each.h | 60 + compat/thrust/system/cuda/detail/for_each.inl | 199 + compat/thrust/system/cuda/detail/gather.h | 22 + compat/thrust/system/cuda/detail/generate.h | 22 + compat/thrust/system/cuda/detail/get_value.h | 93 + .../cuda/detail/guarded_cuda_runtime_api.h | 39 + .../thrust/system/cuda/detail/inner_product.h | 22 + compat/thrust/system/cuda/detail/iter_swap.h | 65 + compat/thrust/system/cuda/detail/logical.h | 22 + .../system/cuda/detail/malloc_and_free.h | 71 + compat/thrust/system/cuda/detail/memory.inl | 94 + compat/thrust/system/cuda/detail/merge.h | 50 + compat/thrust/system/cuda/detail/merge.inl | 285 + compat/thrust/system/cuda/detail/mismatch.h | 22 + compat/thrust/system/cuda/detail/par.h | 66 + compat/thrust/system/cuda/detail/partition.h | 22 + compat/thrust/system/cuda/detail/reduce.h | 54 + compat/thrust/system/cuda/detail/reduce.inl | 275 + .../thrust/system/cuda/detail/reduce_by_key.h | 59 + .../system/cuda/detail/reduce_by_key.inl | 705 ++ .../system/cuda/detail/reduce_intervals.h | 53 + .../system/cuda/detail/reduce_intervals.inl | 203 + compat/thrust/system/cuda/detail/remove.h | 22 + compat/thrust/system/cuda/detail/replace.h | 22 + compat/thrust/system/cuda/detail/reverse.h | 22 + .../cuda/detail/runtime_introspection.h | 78 + .../cuda/detail/runtime_introspection.inl | 169 + compat/thrust/system/cuda/detail/scan.h | 64 + compat/thrust/system/cuda/detail/scan.inl | 82 + .../thrust/system/cuda/detail/scan_by_key.h | 22 + compat/thrust/system/cuda/detail/scatter.h | 22 + compat/thrust/system/cuda/detail/sequence.h | 22 + .../system/cuda/detail/set_difference.inl | 138 + .../system/cuda/detail/set_intersection.inl | 129 + .../system/cuda/detail/set_operations.h | 97 + .../cuda/detail/set_symmetric_difference.inl | 150 + .../thrust/system/cuda/detail/set_union.inl | 150 + compat/thrust/system/cuda/detail/sort.h | 55 + compat/thrust/system/cuda/detail/sort.inl | 287 + .../thrust/system/cuda/detail/swap_ranges.h | 22 + .../thrust/system/cuda/detail/synchronize.h | 41 + .../thrust/system/cuda/detail/synchronize.inl | 56 + compat/thrust/system/cuda/detail/tabulate.h | 22 + .../system/cuda/detail/temporary_buffer.h | 22 + .../detail/temporary_indirect_permutation.h | 217 + compat/thrust/system/cuda/detail/transform.h | 22 + .../system/cuda/detail/transform_reduce.h | 22 + .../system/cuda/detail/transform_scan.h | 22 + .../thrust/system/cuda/detail/trivial_copy.h | 54 + .../system/cuda/detail/trivial_copy.inl | 114 + .../system/cuda/detail/uninitialized_copy.h | 22 + .../system/cuda/detail/uninitialized_fill.h | 22 + compat/thrust/system/cuda/detail/unique.h | 22 + .../thrust/system/cuda/detail/unique_by_key.h | 22 + compat/thrust/system/cuda/detail/vector.inl | 97 + compat/thrust/system/cuda/error.h | 186 + compat/thrust/system/cuda/execution_policy.h | 165 + .../cuda/experimental/pinned_allocator.h | 239 + compat/thrust/system/cuda/memory.h | 421 + compat/thrust/system/cuda/vector.h | 148 + .../system/detail/adl/adjacent_difference.h | 32 + .../thrust/system/detail/adl/assign_value.h | 32 + .../thrust/system/detail/adl/binary_search.h | 32 + compat/thrust/system/detail/adl/copy.h | 32 + compat/thrust/system/detail/adl/copy_if.h | 32 + compat/thrust/system/detail/adl/count.h | 32 + compat/thrust/system/detail/adl/equal.h | 32 + 
compat/thrust/system/detail/adl/extrema.h | 32 + compat/thrust/system/detail/adl/fill.h | 32 + compat/thrust/system/detail/adl/find.h | 32 + compat/thrust/system/detail/adl/for_each.h | 32 + compat/thrust/system/detail/adl/gather.h | 32 + compat/thrust/system/detail/adl/generate.h | 32 + compat/thrust/system/detail/adl/get_value.h | 32 + .../thrust/system/detail/adl/inner_product.h | 32 + compat/thrust/system/detail/adl/iter_swap.h | 32 + compat/thrust/system/detail/adl/logical.h | 32 + .../system/detail/adl/malloc_and_free.h | 32 + compat/thrust/system/detail/adl/merge.h | 32 + compat/thrust/system/detail/adl/mismatch.h | 32 + compat/thrust/system/detail/adl/partition.h | 32 + compat/thrust/system/detail/adl/reduce.h | 32 + .../thrust/system/detail/adl/reduce_by_key.h | 32 + compat/thrust/system/detail/adl/remove.h | 32 + compat/thrust/system/detail/adl/replace.h | 32 + compat/thrust/system/detail/adl/reverse.h | 32 + compat/thrust/system/detail/adl/scan.h | 32 + compat/thrust/system/detail/adl/scan_by_key.h | 32 + compat/thrust/system/detail/adl/scatter.h | 32 + compat/thrust/system/detail/adl/sequence.h | 32 + .../thrust/system/detail/adl/set_operations.h | 32 + compat/thrust/system/detail/adl/sort.h | 32 + compat/thrust/system/detail/adl/swap_ranges.h | 32 + compat/thrust/system/detail/adl/tabulate.h | 32 + .../system/detail/adl/temporary_buffer.h | 32 + compat/thrust/system/detail/adl/transform.h | 32 + .../system/detail/adl/transform_reduce.h | 32 + .../thrust/system/detail/adl/transform_scan.h | 32 + .../system/detail/adl/uninitialized_copy.h | 32 + .../system/detail/adl/uninitialized_fill.h | 32 + compat/thrust/system/detail/adl/unique.h | 32 + .../thrust/system/detail/adl/unique_by_key.h | 32 + compat/thrust/system/detail/bad_alloc.h | 57 + compat/thrust/system/detail/errno.h | 120 + .../thrust/system/detail/error_category.inl | 234 + compat/thrust/system/detail/error_code.inl | 197 + .../thrust/system/detail/error_condition.inl | 133 + .../detail/generic/adjacent_difference.h | 53 + .../detail/generic/adjacent_difference.inl | 76 + compat/thrust/system/detail/generic/advance.h | 40 + .../thrust/system/detail/generic/advance.inl | 62 + .../system/detail/generic/binary_search.h | 156 + .../system/detail/generic/binary_search.inl | 342 + compat/thrust/system/detail/generic/copy.h | 57 + compat/thrust/system/detail/generic/copy.inl | 80 + compat/thrust/system/detail/generic/copy_if.h | 62 + .../thrust/system/detail/generic/copy_if.inl | 155 + compat/thrust/system/detail/generic/count.h | 46 + compat/thrust/system/detail/generic/count.inl | 75 + .../thrust/system/detail/generic/distance.h | 42 + .../thrust/system/detail/generic/distance.inl | 69 + compat/thrust/system/detail/generic/equal.h | 43 + compat/thrust/system/detail/generic/equal.inl | 50 + compat/thrust/system/detail/generic/extrema.h | 76 + .../thrust/system/detail/generic/extrema.inl | 244 + compat/thrust/system/detail/generic/fill.h | 63 + compat/thrust/system/detail/generic/find.h | 56 + compat/thrust/system/detail/generic/find.inl | 141 + .../thrust/system/detail/generic/for_each.h | 72 + compat/thrust/system/detail/generic/gather.h | 78 + .../thrust/system/detail/generic/gather.inl | 102 + .../thrust/system/detail/generic/generate.h | 55 + .../thrust/system/detail/generic/generate.inl | 59 + .../system/detail/generic/inner_product.h | 54 + .../system/detail/generic/inner_product.inl | 70 + compat/thrust/system/detail/generic/logical.h | 56 + compat/thrust/system/detail/generic/memory.h | 67 + 
.../thrust/system/detail/generic/memory.inl | 92 + compat/thrust/system/detail/generic/merge.h | 87 + compat/thrust/system/detail/generic/merge.inl | 125 + .../thrust/system/detail/generic/mismatch.h | 56 + .../thrust/system/detail/generic/mismatch.inl | 70 + .../thrust/system/detail/generic/partition.h | 150 + .../system/detail/generic/partition.inl | 238 + compat/thrust/system/detail/generic/reduce.h | 52 + .../thrust/system/detail/generic/reduce.inl | 74 + .../system/detail/generic/reduce_by_key.h | 86 + .../system/detail/generic/reduce_by_key.inl | 212 + compat/thrust/system/detail/generic/remove.h | 100 + .../thrust/system/detail/generic/remove.inl | 144 + compat/thrust/system/detail/generic/replace.h | 92 + .../thrust/system/detail/generic/replace.inl | 168 + compat/thrust/system/detail/generic/reverse.h | 51 + .../thrust/system/detail/generic/reverse.inl | 70 + .../detail/generic/scalar/binary_search.h | 85 + .../detail/generic/scalar/binary_search.inl | 159 + compat/thrust/system/detail/generic/scan.h | 94 + compat/thrust/system/detail/generic/scan.inl | 144 + .../system/detail/generic/scan_by_key.h | 137 + .../system/detail/generic/scan_by_key.inl | 239 + compat/thrust/system/detail/generic/scatter.h | 76 + .../thrust/system/detail/generic/scatter.inl | 93 + .../system/detail/generic/select_system.h | 182 + .../thrust/system/detail/generic/sequence.h | 61 + .../thrust/system/detail/generic/sequence.inl | 69 + .../system/detail/generic/set_operations.h | 303 + .../system/detail/generic/set_operations.inl | 449 + compat/thrust/system/detail/generic/sort.h | 142 + compat/thrust/system/detail/generic/sort.inl | 202 + .../system/detail/generic/swap_ranges.h | 46 + .../system/detail/generic/swap_ranges.inl | 73 + .../thrust/system/detail/generic/tabulate.h | 55 + .../thrust/system/detail/generic/tabulate.inl | 59 + compat/thrust/system/detail/generic/tag.h | 48 + .../system/detail/generic/temporary_buffer.h | 49 + .../detail/generic/temporary_buffer.inl | 60 + .../thrust/system/detail/generic/transform.h | 101 + .../system/detail/generic/transform.inl | 214 + .../system/detail/generic/transform_reduce.h | 50 + .../detail/generic/transform_reduce.inl | 53 + .../system/detail/generic/transform_scan.h | 64 + .../system/detail/generic/transform_scan.inl | 124 + .../system/detail/generic/type_traits.h | 168 + .../detail/generic/uninitialized_copy.h | 55 + .../detail/generic/uninitialized_copy.inl | 187 + .../detail/generic/uninitialized_fill.h | 55 + .../detail/generic/uninitialized_fill.inl | 128 + compat/thrust/system/detail/generic/unique.h | 74 + .../thrust/system/detail/generic/unique.inl | 114 + .../system/detail/generic/unique_by_key.h | 91 + .../system/detail/generic/unique_by_key.inl | 142 + .../thrust/system/detail/internal/decompose.h | 113 + .../internal/scalar/adjacent_difference.h | 70 + .../detail/internal/scalar/binary_search.h | 143 + .../system/detail/internal/scalar/copy.h | 56 + .../system/detail/internal/scalar/copy.inl | 127 + .../detail/internal/scalar/copy_backward.h | 53 + .../system/detail/internal/scalar/copy_if.h | 69 + .../system/detail/internal/scalar/extrema.h | 127 + .../system/detail/internal/scalar/find.h | 67 + .../system/detail/internal/scalar/for_each.h | 87 + .../detail/internal/scalar/general_copy.h | 65 + .../detail/internal/scalar/insertion_sort.h | 149 + .../system/detail/internal/scalar/merge.h | 73 + .../system/detail/internal/scalar/merge.inl | 145 + .../system/detail/internal/scalar/partition.h | 262 + .../system/detail/internal/scalar/reduce.h | 
69 + .../detail/internal/scalar/reduce_by_key.h | 103 + .../system/detail/internal/scalar/remove.h | 185 + .../system/detail/internal/scalar/scan.h | 153 + .../detail/internal/scalar/scan_by_key.h | 147 + .../detail/internal/scalar/set_operations.h | 208 + .../system/detail/internal/scalar/sort.h | 57 + .../system/detail/internal/scalar/sort.inl | 161 + .../internal/scalar/stable_merge_sort.h | 55 + .../internal/scalar/stable_merge_sort.inl | 150 + .../internal/scalar/stable_primitive_sort.h | 49 + .../internal/scalar/stable_primitive_sort.inl | 142 + .../internal/scalar/stable_radix_sort.h | 54 + .../internal/scalar/stable_radix_sort.inl | 434 + .../detail/internal/scalar/trivial_copy.h | 51 + .../system/detail/internal/scalar/unique.h | 90 + .../detail/internal/scalar/unique_by_key.h | 109 + compat/thrust/system/detail/system_error.inl | 111 + compat/thrust/system/error_code.h | 521 ++ .../system/omp/detail/adjacent_difference.h | 50 + .../thrust/system/omp/detail/assign_value.h | 23 + .../thrust/system/omp/detail/binary_search.h | 73 + compat/thrust/system/omp/detail/copy.h | 57 + compat/thrust/system/omp/detail/copy.inl | 147 + compat/thrust/system/omp/detail/copy_if.h | 51 + compat/thrust/system/omp/detail/copy_if.inl | 54 + compat/thrust/system/omp/detail/count.h | 23 + .../system/omp/detail/default_decomposition.h | 45 + .../omp/detail/default_decomposition.inl | 56 + compat/thrust/system/omp/detail/equal.h | 23 + .../system/omp/detail/execution_policy.h | 110 + compat/thrust/system/omp/detail/extrema.h | 67 + compat/thrust/system/omp/detail/fill.h | 23 + compat/thrust/system/omp/detail/find.h | 51 + compat/thrust/system/omp/detail/for_each.h | 60 + compat/thrust/system/omp/detail/for_each.inl | 97 + compat/thrust/system/omp/detail/gather.h | 23 + compat/thrust/system/omp/detail/generate.h | 23 + compat/thrust/system/omp/detail/get_value.h | 23 + .../thrust/system/omp/detail/inner_product.h | 23 + compat/thrust/system/omp/detail/iter_swap.h | 23 + compat/thrust/system/omp/detail/logical.h | 23 + .../system/omp/detail/malloc_and_free.h | 23 + compat/thrust/system/omp/detail/memory.inl | 110 + compat/thrust/system/omp/detail/merge.h | 23 + compat/thrust/system/omp/detail/mismatch.h | 23 + compat/thrust/system/omp/detail/par.h | 66 + compat/thrust/system/omp/detail/partition.h | 91 + compat/thrust/system/omp/detail/partition.inl | 108 + compat/thrust/system/omp/detail/reduce.h | 54 + compat/thrust/system/omp/detail/reduce.inl | 72 + .../thrust/system/omp/detail/reduce_by_key.h | 61 + .../system/omp/detail/reduce_by_key.inl | 57 + .../system/omp/detail/reduce_intervals.h | 53 + .../system/omp/detail/reduce_intervals.inl | 93 + compat/thrust/system/omp/detail/remove.h | 81 + compat/thrust/system/omp/detail/remove.inl | 94 + compat/thrust/system/omp/detail/replace.h | 23 + compat/thrust/system/omp/detail/reverse.h | 23 + compat/thrust/system/omp/detail/scan.h | 23 + compat/thrust/system/omp/detail/scan_by_key.h | 23 + compat/thrust/system/omp/detail/scatter.h | 23 + compat/thrust/system/omp/detail/sequence.h | 23 + .../thrust/system/omp/detail/set_operations.h | 23 + compat/thrust/system/omp/detail/sort.h | 55 + compat/thrust/system/omp/detail/sort.inl | 249 + compat/thrust/system/omp/detail/swap_ranges.h | 23 + compat/thrust/system/omp/detail/tabulate.h | 23 + .../system/omp/detail/temporary_buffer.h | 22 + compat/thrust/system/omp/detail/transform.h | 23 + .../system/omp/detail/transform_reduce.h | 23 + .../thrust/system/omp/detail/transform_scan.h | 23 + 
.../system/omp/detail/uninitialized_copy.h | 23 + .../system/omp/detail/uninitialized_fill.h | 23 + compat/thrust/system/omp/detail/unique.h | 59 + compat/thrust/system/omp/detail/unique.inl | 66 + .../thrust/system/omp/detail/unique_by_key.h | 67 + .../system/omp/detail/unique_by_key.inl | 74 + compat/thrust/system/omp/detail/vector.inl | 97 + compat/thrust/system/omp/execution_policy.h | 156 + compat/thrust/system/omp/memory.h | 414 + compat/thrust/system/omp/vector.h | 149 + compat/thrust/system/system_error.h | 179 + .../system/tbb/detail/adjacent_difference.h | 50 + .../thrust/system/tbb/detail/assign_value.h | 23 + .../thrust/system/tbb/detail/binary_search.h | 23 + compat/thrust/system/tbb/detail/copy.h | 57 + compat/thrust/system/tbb/detail/copy.inl | 134 + compat/thrust/system/tbb/detail/copy_if.h | 50 + compat/thrust/system/tbb/detail/copy_if.inl | 131 + compat/thrust/system/tbb/detail/count.h | 23 + compat/thrust/system/tbb/detail/equal.h | 23 + .../system/tbb/detail/execution_policy.h | 86 + compat/thrust/system/tbb/detail/extrema.h | 67 + compat/thrust/system/tbb/detail/fill.h | 23 + compat/thrust/system/tbb/detail/find.h | 46 + compat/thrust/system/tbb/detail/for_each.h | 54 + compat/thrust/system/tbb/detail/for_each.inl | 100 + compat/thrust/system/tbb/detail/gather.h | 23 + compat/thrust/system/tbb/detail/generate.h | 23 + compat/thrust/system/tbb/detail/get_value.h | 23 + .../thrust/system/tbb/detail/inner_product.h | 23 + compat/thrust/system/tbb/detail/iter_swap.h | 23 + compat/thrust/system/tbb/detail/logical.h | 23 + .../system/tbb/detail/malloc_and_free.h | 23 + compat/thrust/system/tbb/detail/memory.inl | 110 + compat/thrust/system/tbb/detail/merge.h | 70 + compat/thrust/system/tbb/detail/merge.inl | 285 + compat/thrust/system/tbb/detail/mismatch.h | 23 + compat/thrust/system/tbb/detail/par.h | 66 + compat/thrust/system/tbb/detail/partition.h | 87 + compat/thrust/system/tbb/detail/partition.inl | 102 + compat/thrust/system/tbb/detail/reduce.h | 54 + compat/thrust/system/tbb/detail/reduce.inl | 131 + .../thrust/system/tbb/detail/reduce_by_key.h | 57 + .../system/tbb/detail/reduce_by_key.inl | 344 + .../system/tbb/detail/reduce_intervals.h | 126 + compat/thrust/system/tbb/detail/remove.h | 81 + compat/thrust/system/tbb/detail/remove.inl | 94 + compat/thrust/system/tbb/detail/replace.h | 23 + compat/thrust/system/tbb/detail/reverse.h | 23 + compat/thrust/system/tbb/detail/scan.h | 64 + compat/thrust/system/tbb/detail/scan.inl | 293 + compat/thrust/system/tbb/detail/scan_by_key.h | 23 + compat/thrust/system/tbb/detail/scatter.h | 23 + compat/thrust/system/tbb/detail/sequence.h | 23 + .../thrust/system/tbb/detail/set_operations.h | 23 + compat/thrust/system/tbb/detail/sort.h | 55 + compat/thrust/system/tbb/detail/sort.inl | 251 + compat/thrust/system/tbb/detail/swap_ranges.h | 23 + compat/thrust/system/tbb/detail/tabulate.h | 23 + .../system/tbb/detail/temporary_buffer.h | 22 + compat/thrust/system/tbb/detail/transform.h | 23 + .../system/tbb/detail/transform_reduce.h | 23 + .../thrust/system/tbb/detail/transform_scan.h | 23 + .../system/tbb/detail/uninitialized_copy.h | 23 + .../system/tbb/detail/uninitialized_fill.h | 23 + compat/thrust/system/tbb/detail/unique.h | 59 + compat/thrust/system/tbb/detail/unique.inl | 66 + .../thrust/system/tbb/detail/unique_by_key.h | 67 + .../system/tbb/detail/unique_by_key.inl | 74 + compat/thrust/system/tbb/detail/vector.inl | 97 + compat/thrust/system/tbb/execution_policy.h | 156 + compat/thrust/system/tbb/memory.h | 414 + 
compat/thrust/system/tbb/vector.h | 144 + compat/thrust/system_error.h | 51 + compat/thrust/tabulate.h | 128 + compat/thrust/transform.h | 720 ++ compat/thrust/transform_reduce.h | 197 + compat/thrust/transform_scan.h | 322 + compat/thrust/tuple.h | 583 ++ compat/thrust/uninitialized_copy.h | 301 + compat/thrust/uninitialized_fill.h | 273 + compat/thrust/unique.h | 960 ++ compat/thrust/version.h | 73 + compat/unistd.h | 2 + compile | 310 + config.guess | 1526 ++++ config.sub | 1658 ++++ configure | 8127 +++++++++++++++++ configure.ac | 164 + configure.sh | 1 + cpu-miner.c | 1523 +++ cpuminer-config.h | 190 + cpuminer-config.h.in | 199 + cuda_blake512.cu | 308 + cuda_blake512.h | 8 + cuda_combine.cu | 151 + cuda_combine.h | 7 + cuda_fugue256.cu | 782 ++ cuda_fugue256.h | 8 + cuda_groestl512.cu | 837 ++ cuda_groestl512.h | 9 + cuda_hefty1.cu | 401 + cuda_hefty1.h | 8 + cuda_keccak512.cu | 274 + cuda_keccak512.h | 9 + cuda_sha256.cu | 274 + cuda_sha256.h | 8 + depcomp | 688 ++ elist.h | 251 + files.txt | 30 + fugue.c | 1208 +++ fuguecoin.cpp | 74 + groestl.c | 3123 +++++++ heavy.cu | 416 + hefty1.c | 371 + hefty1.h | 66 + install-sh | 527 ++ keccak.c | 1824 ++++ miner.h | 315 + missing | 367 + scrypt.c | 756 ++ sha2.c | 630 ++ sph_blake.h | 327 + sph_fugue.h | 81 + sph_groestl.h | 329 + sph_keccak.h | 293 + sph_types.h | 1976 ++++ uint256.h | 784 ++ util.c | 1316 +++ 807 files changed, 147888 insertions(+), 674 deletions(-) create mode 100644 AUTHORS create mode 100644 COPYING create mode 100644 ChangeLog create mode 100644 INSTALL create mode 100644 LICENSE.txt create mode 100644 Makefile.am create mode 100644 Makefile.in create mode 100644 NEWS create mode 100644 README create mode 100644 README.txt create mode 100644 aclocal.m4 create mode 100755 autogen.sh create mode 100644 blake.c create mode 100644 ccminer.sln create mode 100644 ccminer.vcxproj create mode 100644 ccminer.vcxproj.filters create mode 100644 compat.h create mode 100644 compat/Makefile.am create mode 100644 compat/Makefile.in create mode 100644 compat/getopt/getopt.h create mode 100644 compat/getopt/getopt_long.c create mode 100644 compat/gettimeofday.c create mode 100644 compat/inttypes.h create mode 100644 compat/jansson/Makefile.am create mode 100644 compat/jansson/Makefile.in create mode 100644 compat/jansson/config.h create mode 100644 compat/jansson/dump.c create mode 100644 compat/jansson/hashtable.c create mode 100644 compat/jansson/hashtable.h create mode 100644 compat/jansson/jansson.h create mode 100644 compat/jansson/jansson_private.h create mode 100644 compat/jansson/load.c create mode 100644 compat/jansson/strbuffer.c create mode 100644 compat/jansson/strbuffer.h create mode 100644 compat/jansson/utf.c create mode 100644 compat/jansson/utf.h create mode 100644 compat/jansson/util.h create mode 100644 compat/jansson/value.c create mode 100644 compat/stdbool.h create mode 100644 compat/sys/time.h create mode 100644 compat/thrust/CHANGELOG create mode 100644 compat/thrust/adjacent_difference.h create mode 100644 compat/thrust/advance.h create mode 100644 compat/thrust/binary_search.h create mode 100644 compat/thrust/copy.h create mode 100644 compat/thrust/count.h create mode 100644 compat/thrust/detail/adjacent_difference.inl create mode 100644 compat/thrust/detail/advance.inl create mode 100644 compat/thrust/detail/allocator/allocator_traits.h create mode 100644 compat/thrust/detail/allocator/allocator_traits.inl create mode 100644 compat/thrust/detail/allocator/copy_construct_range.h create mode 100644 
compat/thrust/detail/allocator/copy_construct_range.inl create mode 100644 compat/thrust/detail/allocator/default_construct_range.h create mode 100644 compat/thrust/detail/allocator/default_construct_range.inl create mode 100644 compat/thrust/detail/allocator/destroy_range.h create mode 100644 compat/thrust/detail/allocator/destroy_range.inl create mode 100644 compat/thrust/detail/allocator/fill_construct_range.h create mode 100644 compat/thrust/detail/allocator/fill_construct_range.inl create mode 100644 compat/thrust/detail/allocator/malloc_allocator.h create mode 100644 compat/thrust/detail/allocator/malloc_allocator.inl create mode 100644 compat/thrust/detail/allocator/no_throw_allocator.h create mode 100644 compat/thrust/detail/allocator/tagged_allocator.h create mode 100644 compat/thrust/detail/allocator/tagged_allocator.inl create mode 100644 compat/thrust/detail/allocator/temporary_allocator.h create mode 100644 compat/thrust/detail/allocator/temporary_allocator.inl create mode 100644 compat/thrust/detail/binary_search.inl create mode 100644 compat/thrust/detail/config.h create mode 100644 compat/thrust/detail/config/compiler.h create mode 100644 compat/thrust/detail/config/compiler_fence.h create mode 100644 compat/thrust/detail/config/config.h create mode 100644 compat/thrust/detail/config/debug.h create mode 100644 compat/thrust/detail/config/device_system.h create mode 100644 compat/thrust/detail/config/forceinline.h create mode 100644 compat/thrust/detail/config/hd_warning_disable.h create mode 100644 compat/thrust/detail/config/host_device.h create mode 100644 compat/thrust/detail/config/host_system.h create mode 100644 compat/thrust/detail/config/simple_defines.h create mode 100644 compat/thrust/detail/contiguous_storage.h create mode 100644 compat/thrust/detail/contiguous_storage.inl create mode 100644 compat/thrust/detail/copy.h create mode 100644 compat/thrust/detail/copy.inl create mode 100644 compat/thrust/detail/copy_if.h create mode 100644 compat/thrust/detail/copy_if.inl create mode 100644 compat/thrust/detail/count.inl create mode 100644 compat/thrust/detail/cstdint.h create mode 100644 compat/thrust/detail/device_delete.inl create mode 100644 compat/thrust/detail/device_free.inl create mode 100644 compat/thrust/detail/device_malloc.inl create mode 100644 compat/thrust/detail/device_new.inl create mode 100644 compat/thrust/detail/device_ptr.inl create mode 100644 compat/thrust/detail/device_reference.inl create mode 100644 compat/thrust/detail/device_vector.inl create mode 100644 compat/thrust/detail/dispatch/is_trivial_copy.h create mode 100644 compat/thrust/detail/distance.inl create mode 100644 compat/thrust/detail/equal.inl create mode 100644 compat/thrust/detail/execute_with_allocator.h create mode 100644 compat/thrust/detail/execution_policy.h create mode 100644 compat/thrust/detail/extrema.inl create mode 100644 compat/thrust/detail/fill.inl create mode 100644 compat/thrust/detail/find.inl create mode 100644 compat/thrust/detail/for_each.inl create mode 100644 compat/thrust/detail/function.h create mode 100644 compat/thrust/detail/functional.inl create mode 100644 compat/thrust/detail/functional/actor.h create mode 100644 compat/thrust/detail/functional/actor.inl create mode 100644 compat/thrust/detail/functional/argument.h create mode 100644 compat/thrust/detail/functional/composite.h create mode 100644 compat/thrust/detail/functional/operators.h create mode 100644 compat/thrust/detail/functional/operators/arithmetic_operators.h create mode 100644 
compat/thrust/detail/functional/operators/assignment_operator.h create mode 100644 compat/thrust/detail/functional/operators/bitwise_operators.h create mode 100644 compat/thrust/detail/functional/operators/compound_assignment_operators.h create mode 100644 compat/thrust/detail/functional/operators/logical_operators.h create mode 100644 compat/thrust/detail/functional/operators/operator_adaptors.h create mode 100644 compat/thrust/detail/functional/operators/relational_operators.h create mode 100644 compat/thrust/detail/functional/placeholder.h create mode 100644 compat/thrust/detail/functional/value.h create mode 100644 compat/thrust/detail/gather.inl create mode 100644 compat/thrust/detail/generate.inl create mode 100644 compat/thrust/detail/host_vector.inl create mode 100644 compat/thrust/detail/inner_product.inl create mode 100644 compat/thrust/detail/integer_traits.h create mode 100644 compat/thrust/detail/internal_functional.h create mode 100644 compat/thrust/detail/logical.inl create mode 100644 compat/thrust/detail/malloc_and_free.h create mode 100644 compat/thrust/detail/merge.inl create mode 100644 compat/thrust/detail/minmax.h create mode 100644 compat/thrust/detail/mismatch.inl create mode 100644 compat/thrust/detail/mpl/math.h create mode 100644 compat/thrust/detail/numeric_traits.h create mode 100644 compat/thrust/detail/overlapped_copy.h create mode 100644 compat/thrust/detail/pair.inl create mode 100644 compat/thrust/detail/partition.inl create mode 100644 compat/thrust/detail/pointer.h create mode 100644 compat/thrust/detail/pointer.inl create mode 100644 compat/thrust/detail/range/tail_flags.h create mode 100644 compat/thrust/detail/raw_pointer_cast.h create mode 100644 compat/thrust/detail/raw_reference_cast.h create mode 100644 compat/thrust/detail/raw_reference_cast.inl create mode 100644 compat/thrust/detail/reduce.inl create mode 100644 compat/thrust/detail/reference.h create mode 100644 compat/thrust/detail/reference.inl create mode 100644 compat/thrust/detail/reference_forward_declaration.h create mode 100644 compat/thrust/detail/remove.inl create mode 100644 compat/thrust/detail/replace.inl create mode 100644 compat/thrust/detail/reverse.inl create mode 100644 compat/thrust/detail/scan.inl create mode 100644 compat/thrust/detail/scatter.inl create mode 100644 compat/thrust/detail/sequence.inl create mode 100644 compat/thrust/detail/set_operations.inl create mode 100644 compat/thrust/detail/sort.inl create mode 100644 compat/thrust/detail/static_assert.h create mode 100644 compat/thrust/detail/swap.h create mode 100644 compat/thrust/detail/swap.inl create mode 100644 compat/thrust/detail/swap_ranges.inl create mode 100644 compat/thrust/detail/tabulate.inl create mode 100644 compat/thrust/detail/temporary_array.h create mode 100644 compat/thrust/detail/temporary_array.inl create mode 100644 compat/thrust/detail/temporary_buffer.h create mode 100644 compat/thrust/detail/transform.inl create mode 100644 compat/thrust/detail/transform_reduce.inl create mode 100644 compat/thrust/detail/transform_scan.inl create mode 100644 compat/thrust/detail/trivial_sequence.h create mode 100644 compat/thrust/detail/tuple.inl create mode 100644 compat/thrust/detail/tuple_meta_transform.h create mode 100644 compat/thrust/detail/tuple_transform.h create mode 100644 compat/thrust/detail/type_traits.h create mode 100644 compat/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h create mode 100644 compat/thrust/detail/type_traits/function_traits.h create mode 
100644 compat/thrust/detail/type_traits/has_member_function.h create mode 100644 compat/thrust/detail/type_traits/has_nested_type.h create mode 100644 compat/thrust/detail/type_traits/has_trivial_assign.h create mode 100644 compat/thrust/detail/type_traits/is_call_possible.h create mode 100644 compat/thrust/detail/type_traits/is_metafunction_defined.h create mode 100644 compat/thrust/detail/type_traits/iterator/is_discard_iterator.h create mode 100644 compat/thrust/detail/type_traits/iterator/is_output_iterator.h create mode 100644 compat/thrust/detail/type_traits/minimum_type.h create mode 100644 compat/thrust/detail/type_traits/pointer_traits.h create mode 100644 compat/thrust/detail/type_traits/result_of.h create mode 100644 compat/thrust/detail/uninitialized_copy.inl create mode 100644 compat/thrust/detail/uninitialized_fill.inl create mode 100644 compat/thrust/detail/unique.inl create mode 100644 compat/thrust/detail/use_default.h create mode 100644 compat/thrust/detail/util/align.h create mode 100644 compat/thrust/detail/util/blocking.h create mode 100644 compat/thrust/detail/vector_base.h create mode 100644 compat/thrust/detail/vector_base.inl create mode 100644 compat/thrust/device_allocator.h create mode 100644 compat/thrust/device_delete.h create mode 100644 compat/thrust/device_free.h create mode 100644 compat/thrust/device_malloc.h create mode 100644 compat/thrust/device_malloc_allocator.h create mode 100644 compat/thrust/device_new.h create mode 100644 compat/thrust/device_new_allocator.h create mode 100644 compat/thrust/device_ptr.h create mode 100644 compat/thrust/device_reference.h create mode 100644 compat/thrust/device_vector.h create mode 100644 compat/thrust/distance.h create mode 100644 compat/thrust/equal.h create mode 100644 compat/thrust/execution_policy.h create mode 100644 compat/thrust/extrema.h create mode 100644 compat/thrust/fill.h create mode 100644 compat/thrust/find.h create mode 100644 compat/thrust/for_each.h create mode 100644 compat/thrust/functional.h create mode 100644 compat/thrust/gather.h create mode 100644 compat/thrust/generate.h create mode 100644 compat/thrust/host_vector.h create mode 100644 compat/thrust/inner_product.h create mode 100644 compat/thrust/iterator/constant_iterator.h create mode 100644 compat/thrust/iterator/counting_iterator.h create mode 100644 compat/thrust/iterator/detail/any_assign.h create mode 100644 compat/thrust/iterator/detail/any_system_tag.h create mode 100644 compat/thrust/iterator/detail/constant_iterator_base.h create mode 100644 compat/thrust/iterator/detail/counting_iterator.inl create mode 100644 compat/thrust/iterator/detail/device_system_tag.h create mode 100644 compat/thrust/iterator/detail/discard_iterator_base.h create mode 100644 compat/thrust/iterator/detail/distance_from_result.h create mode 100644 compat/thrust/iterator/detail/host_system_tag.h create mode 100644 compat/thrust/iterator/detail/is_iterator_category.h create mode 100644 compat/thrust/iterator/detail/is_trivial_iterator.h create mode 100644 compat/thrust/iterator/detail/iterator_adaptor_base.h create mode 100644 compat/thrust/iterator/detail/iterator_category_to_system.h create mode 100644 compat/thrust/iterator/detail/iterator_category_to_traversal.h create mode 100644 compat/thrust/iterator/detail/iterator_facade_category.h create mode 100644 compat/thrust/iterator/detail/iterator_traits.inl create mode 100644 compat/thrust/iterator/detail/iterator_traversal_tags.h create mode 100644 compat/thrust/iterator/detail/minimum_category.h 
create mode 100644 compat/thrust/iterator/detail/minimum_system.h create mode 100644 compat/thrust/iterator/detail/normal_iterator.h create mode 100644 compat/thrust/iterator/detail/permutation_iterator_base.h create mode 100644 compat/thrust/iterator/detail/retag.h create mode 100644 compat/thrust/iterator/detail/reverse_iterator.inl create mode 100644 compat/thrust/iterator/detail/reverse_iterator_base.h create mode 100644 compat/thrust/iterator/detail/tagged_iterator.h create mode 100644 compat/thrust/iterator/detail/transform_iterator.inl create mode 100644 compat/thrust/iterator/detail/tuple_of_iterator_references.h create mode 100644 compat/thrust/iterator/detail/universal_categories.h create mode 100644 compat/thrust/iterator/detail/zip_iterator.inl create mode 100644 compat/thrust/iterator/detail/zip_iterator_base.h create mode 100644 compat/thrust/iterator/discard_iterator.h create mode 100644 compat/thrust/iterator/iterator_adaptor.h create mode 100644 compat/thrust/iterator/iterator_categories.h create mode 100644 compat/thrust/iterator/iterator_facade.h create mode 100644 compat/thrust/iterator/iterator_traits.h create mode 100644 compat/thrust/iterator/permutation_iterator.h create mode 100644 compat/thrust/iterator/retag.h create mode 100644 compat/thrust/iterator/reverse_iterator.h create mode 100644 compat/thrust/iterator/transform_iterator.h create mode 100644 compat/thrust/iterator/zip_iterator.h create mode 100644 compat/thrust/logical.h create mode 100644 compat/thrust/memory.h create mode 100644 compat/thrust/merge.h create mode 100644 compat/thrust/mismatch.h create mode 100644 compat/thrust/pair.h create mode 100644 compat/thrust/partition.h create mode 100644 compat/thrust/random.h create mode 100644 compat/thrust/random/detail/discard_block_engine.inl create mode 100644 compat/thrust/random/detail/linear_congruential_engine.inl create mode 100644 compat/thrust/random/detail/linear_congruential_engine_discard.h create mode 100644 compat/thrust/random/detail/linear_feedback_shift_engine.inl create mode 100644 compat/thrust/random/detail/linear_feedback_shift_engine_wordmask.h create mode 100644 compat/thrust/random/detail/mod.h create mode 100644 compat/thrust/random/detail/normal_distribution.inl create mode 100644 compat/thrust/random/detail/normal_distribution_base.h create mode 100644 compat/thrust/random/detail/random_core_access.h create mode 100644 compat/thrust/random/detail/subtract_with_carry_engine.inl create mode 100644 compat/thrust/random/detail/uniform_int_distribution.inl create mode 100644 compat/thrust/random/detail/uniform_real_distribution.inl create mode 100644 compat/thrust/random/detail/xor_combine_engine.inl create mode 100644 compat/thrust/random/detail/xor_combine_engine_max.h create mode 100644 compat/thrust/random/discard_block_engine.h create mode 100644 compat/thrust/random/linear_congruential_engine.h create mode 100644 compat/thrust/random/linear_feedback_shift_engine.h create mode 100644 compat/thrust/random/normal_distribution.h create mode 100644 compat/thrust/random/subtract_with_carry_engine.h create mode 100644 compat/thrust/random/uniform_int_distribution.h create mode 100644 compat/thrust/random/uniform_real_distribution.h create mode 100644 compat/thrust/random/xor_combine_engine.h create mode 100644 compat/thrust/reduce.h create mode 100644 compat/thrust/remove.h create mode 100644 compat/thrust/replace.h create mode 100644 compat/thrust/reverse.h create mode 100644 compat/thrust/scan.h create mode 100644 
compat/thrust/scatter.h create mode 100644 compat/thrust/sequence.h create mode 100644 compat/thrust/set_operations.h create mode 100644 compat/thrust/sort.h create mode 100644 compat/thrust/swap.h create mode 100644 compat/thrust/system/cpp/detail/adjacent_difference.h create mode 100644 compat/thrust/system/cpp/detail/assign_value.h create mode 100644 compat/thrust/system/cpp/detail/binary_search.h create mode 100644 compat/thrust/system/cpp/detail/copy.h create mode 100644 compat/thrust/system/cpp/detail/copy_if.h create mode 100644 compat/thrust/system/cpp/detail/count.h create mode 100644 compat/thrust/system/cpp/detail/dispatch/sort.h create mode 100644 compat/thrust/system/cpp/detail/equal.h create mode 100644 compat/thrust/system/cpp/detail/execution_policy.h create mode 100644 compat/thrust/system/cpp/detail/extrema.h create mode 100644 compat/thrust/system/cpp/detail/fill.h create mode 100644 compat/thrust/system/cpp/detail/find.h create mode 100644 compat/thrust/system/cpp/detail/for_each.h create mode 100644 compat/thrust/system/cpp/detail/gather.h create mode 100644 compat/thrust/system/cpp/detail/generate.h create mode 100644 compat/thrust/system/cpp/detail/get_value.h create mode 100644 compat/thrust/system/cpp/detail/inner_product.h create mode 100644 compat/thrust/system/cpp/detail/iter_swap.h create mode 100644 compat/thrust/system/cpp/detail/logical.h create mode 100644 compat/thrust/system/cpp/detail/malloc_and_free.h create mode 100644 compat/thrust/system/cpp/detail/memory.inl create mode 100644 compat/thrust/system/cpp/detail/merge.h create mode 100644 compat/thrust/system/cpp/detail/mismatch.h create mode 100644 compat/thrust/system/cpp/detail/par.h create mode 100644 compat/thrust/system/cpp/detail/partition.h create mode 100644 compat/thrust/system/cpp/detail/reduce.h create mode 100644 compat/thrust/system/cpp/detail/reduce_by_key.h create mode 100644 compat/thrust/system/cpp/detail/remove.h create mode 100644 compat/thrust/system/cpp/detail/replace.h create mode 100644 compat/thrust/system/cpp/detail/reverse.h create mode 100644 compat/thrust/system/cpp/detail/scan.h create mode 100644 compat/thrust/system/cpp/detail/scan_by_key.h create mode 100644 compat/thrust/system/cpp/detail/scatter.h create mode 100644 compat/thrust/system/cpp/detail/sequence.h create mode 100644 compat/thrust/system/cpp/detail/set_operations.h create mode 100644 compat/thrust/system/cpp/detail/sort.h create mode 100644 compat/thrust/system/cpp/detail/swap_ranges.h create mode 100644 compat/thrust/system/cpp/detail/tabulate.h create mode 100644 compat/thrust/system/cpp/detail/temporary_buffer.h create mode 100644 compat/thrust/system/cpp/detail/transform.h create mode 100644 compat/thrust/system/cpp/detail/transform_reduce.h create mode 100644 compat/thrust/system/cpp/detail/transform_scan.h create mode 100644 compat/thrust/system/cpp/detail/uninitialized_copy.h create mode 100644 compat/thrust/system/cpp/detail/uninitialized_fill.h create mode 100644 compat/thrust/system/cpp/detail/unique.h create mode 100644 compat/thrust/system/cpp/detail/unique_by_key.h create mode 100644 compat/thrust/system/cpp/detail/vector.inl create mode 100644 compat/thrust/system/cpp/execution_policy.h create mode 100644 compat/thrust/system/cpp/memory.h create mode 100644 compat/thrust/system/cpp/vector.h create mode 100644 compat/thrust/system/cuda/detail/adjacent_difference.h create mode 100644 compat/thrust/system/cuda/detail/adjacent_difference.inl create mode 100644 
compat/thrust/system/cuda/detail/assign_value.h create mode 100644 compat/thrust/system/cuda/detail/binary_search.h create mode 100644 compat/thrust/system/cuda/detail/block/copy.h create mode 100644 compat/thrust/system/cuda/detail/block/exclusive_scan.h create mode 100644 compat/thrust/system/cuda/detail/block/inclusive_scan.h create mode 100644 compat/thrust/system/cuda/detail/block/merge.h create mode 100644 compat/thrust/system/cuda/detail/block/merge.inl create mode 100644 compat/thrust/system/cuda/detail/block/merging_sort.h create mode 100644 compat/thrust/system/cuda/detail/block/odd_even_sort.h create mode 100644 compat/thrust/system/cuda/detail/block/reduce.h create mode 100644 compat/thrust/system/cuda/detail/copy.h create mode 100644 compat/thrust/system/cuda/detail/copy.inl create mode 100644 compat/thrust/system/cuda/detail/copy_cross_system.h create mode 100644 compat/thrust/system/cuda/detail/copy_cross_system.inl create mode 100644 compat/thrust/system/cuda/detail/copy_device_to_device.h create mode 100644 compat/thrust/system/cuda/detail/copy_device_to_device.inl create mode 100644 compat/thrust/system/cuda/detail/copy_if.h create mode 100644 compat/thrust/system/cuda/detail/copy_if.inl create mode 100644 compat/thrust/system/cuda/detail/count.h create mode 100644 compat/thrust/system/cuda/detail/cuda_launch_config.h create mode 100644 compat/thrust/system/cuda/detail/default_decomposition.h create mode 100644 compat/thrust/system/cuda/detail/default_decomposition.inl create mode 100644 compat/thrust/system/cuda/detail/detail/alignment.h create mode 100644 compat/thrust/system/cuda/detail/detail/b40c/kernel_utils.h create mode 100644 compat/thrust/system/cuda/detail/detail/b40c/radixsort_api.h create mode 100644 compat/thrust/system/cuda/detail/detail/b40c/radixsort_kernel_common.h create mode 100644 compat/thrust/system/cuda/detail/detail/b40c/radixsort_key_conversion.h create mode 100644 compat/thrust/system/cuda/detail/detail/b40c/radixsort_reduction_kernel.h create mode 100644 compat/thrust/system/cuda/detail/detail/b40c/radixsort_scanscatter_kernel.h create mode 100644 compat/thrust/system/cuda/detail/detail/b40c/radixsort_spine_kernel.h create mode 100644 compat/thrust/system/cuda/detail/detail/b40c/vector_types.h create mode 100644 compat/thrust/system/cuda/detail/detail/balanced_path.h create mode 100644 compat/thrust/system/cuda/detail/detail/cached_temporary_allocator.h create mode 100644 compat/thrust/system/cuda/detail/detail/fast_scan.h create mode 100644 compat/thrust/system/cuda/detail/detail/fast_scan.inl create mode 100644 compat/thrust/system/cuda/detail/detail/launch_calculator.h create mode 100644 compat/thrust/system/cuda/detail/detail/launch_calculator.inl create mode 100644 compat/thrust/system/cuda/detail/detail/launch_closure.h create mode 100644 compat/thrust/system/cuda/detail/detail/launch_closure.inl create mode 100644 compat/thrust/system/cuda/detail/detail/set_operation.h create mode 100644 compat/thrust/system/cuda/detail/detail/set_operation.inl create mode 100644 compat/thrust/system/cuda/detail/detail/stable_merge_sort.h create mode 100644 compat/thrust/system/cuda/detail/detail/stable_merge_sort.inl create mode 100644 compat/thrust/system/cuda/detail/detail/stable_primitive_sort.h create mode 100644 compat/thrust/system/cuda/detail/detail/stable_primitive_sort.inl create mode 100644 compat/thrust/system/cuda/detail/detail/stable_radix_sort.h create mode 100644 compat/thrust/system/cuda/detail/detail/stable_radix_sort.inl create mode 
100644 compat/thrust/system/cuda/detail/detail/stable_sort_by_count.h create mode 100644 compat/thrust/system/cuda/detail/detail/stable_sort_by_count.inl create mode 100644 compat/thrust/system/cuda/detail/detail/uninitialized.h create mode 100644 compat/thrust/system/cuda/detail/equal.h create mode 100644 compat/thrust/system/cuda/detail/error.inl create mode 100644 compat/thrust/system/cuda/detail/execution_policy.h create mode 100644 compat/thrust/system/cuda/detail/extern_shared_ptr.h create mode 100644 compat/thrust/system/cuda/detail/extrema.h create mode 100644 compat/thrust/system/cuda/detail/fill.h create mode 100644 compat/thrust/system/cuda/detail/fill.inl create mode 100644 compat/thrust/system/cuda/detail/find.h create mode 100644 compat/thrust/system/cuda/detail/for_each.h create mode 100644 compat/thrust/system/cuda/detail/for_each.inl create mode 100644 compat/thrust/system/cuda/detail/gather.h create mode 100644 compat/thrust/system/cuda/detail/generate.h create mode 100644 compat/thrust/system/cuda/detail/get_value.h create mode 100644 compat/thrust/system/cuda/detail/guarded_cuda_runtime_api.h create mode 100644 compat/thrust/system/cuda/detail/inner_product.h create mode 100644 compat/thrust/system/cuda/detail/iter_swap.h create mode 100644 compat/thrust/system/cuda/detail/logical.h create mode 100644 compat/thrust/system/cuda/detail/malloc_and_free.h create mode 100644 compat/thrust/system/cuda/detail/memory.inl create mode 100644 compat/thrust/system/cuda/detail/merge.h create mode 100644 compat/thrust/system/cuda/detail/merge.inl create mode 100644 compat/thrust/system/cuda/detail/mismatch.h create mode 100644 compat/thrust/system/cuda/detail/par.h create mode 100644 compat/thrust/system/cuda/detail/partition.h create mode 100644 compat/thrust/system/cuda/detail/reduce.h create mode 100644 compat/thrust/system/cuda/detail/reduce.inl create mode 100644 compat/thrust/system/cuda/detail/reduce_by_key.h create mode 100644 compat/thrust/system/cuda/detail/reduce_by_key.inl create mode 100644 compat/thrust/system/cuda/detail/reduce_intervals.h create mode 100644 compat/thrust/system/cuda/detail/reduce_intervals.inl create mode 100644 compat/thrust/system/cuda/detail/remove.h create mode 100644 compat/thrust/system/cuda/detail/replace.h create mode 100644 compat/thrust/system/cuda/detail/reverse.h create mode 100644 compat/thrust/system/cuda/detail/runtime_introspection.h create mode 100644 compat/thrust/system/cuda/detail/runtime_introspection.inl create mode 100644 compat/thrust/system/cuda/detail/scan.h create mode 100644 compat/thrust/system/cuda/detail/scan.inl create mode 100644 compat/thrust/system/cuda/detail/scan_by_key.h create mode 100644 compat/thrust/system/cuda/detail/scatter.h create mode 100644 compat/thrust/system/cuda/detail/sequence.h create mode 100644 compat/thrust/system/cuda/detail/set_difference.inl create mode 100644 compat/thrust/system/cuda/detail/set_intersection.inl create mode 100644 compat/thrust/system/cuda/detail/set_operations.h create mode 100644 compat/thrust/system/cuda/detail/set_symmetric_difference.inl create mode 100644 compat/thrust/system/cuda/detail/set_union.inl create mode 100644 compat/thrust/system/cuda/detail/sort.h create mode 100644 compat/thrust/system/cuda/detail/sort.inl create mode 100644 compat/thrust/system/cuda/detail/swap_ranges.h create mode 100644 compat/thrust/system/cuda/detail/synchronize.h create mode 100644 compat/thrust/system/cuda/detail/synchronize.inl create mode 100644 
compat/thrust/system/cuda/detail/tabulate.h create mode 100644 compat/thrust/system/cuda/detail/temporary_buffer.h create mode 100644 compat/thrust/system/cuda/detail/temporary_indirect_permutation.h create mode 100644 compat/thrust/system/cuda/detail/transform.h create mode 100644 compat/thrust/system/cuda/detail/transform_reduce.h create mode 100644 compat/thrust/system/cuda/detail/transform_scan.h create mode 100644 compat/thrust/system/cuda/detail/trivial_copy.h create mode 100644 compat/thrust/system/cuda/detail/trivial_copy.inl create mode 100644 compat/thrust/system/cuda/detail/uninitialized_copy.h create mode 100644 compat/thrust/system/cuda/detail/uninitialized_fill.h create mode 100644 compat/thrust/system/cuda/detail/unique.h create mode 100644 compat/thrust/system/cuda/detail/unique_by_key.h create mode 100644 compat/thrust/system/cuda/detail/vector.inl create mode 100644 compat/thrust/system/cuda/error.h create mode 100644 compat/thrust/system/cuda/execution_policy.h create mode 100644 compat/thrust/system/cuda/experimental/pinned_allocator.h create mode 100644 compat/thrust/system/cuda/memory.h create mode 100644 compat/thrust/system/cuda/vector.h create mode 100644 compat/thrust/system/detail/adl/adjacent_difference.h create mode 100644 compat/thrust/system/detail/adl/assign_value.h create mode 100644 compat/thrust/system/detail/adl/binary_search.h create mode 100644 compat/thrust/system/detail/adl/copy.h create mode 100644 compat/thrust/system/detail/adl/copy_if.h create mode 100644 compat/thrust/system/detail/adl/count.h create mode 100644 compat/thrust/system/detail/adl/equal.h create mode 100644 compat/thrust/system/detail/adl/extrema.h create mode 100644 compat/thrust/system/detail/adl/fill.h create mode 100644 compat/thrust/system/detail/adl/find.h create mode 100644 compat/thrust/system/detail/adl/for_each.h create mode 100644 compat/thrust/system/detail/adl/gather.h create mode 100644 compat/thrust/system/detail/adl/generate.h create mode 100644 compat/thrust/system/detail/adl/get_value.h create mode 100644 compat/thrust/system/detail/adl/inner_product.h create mode 100644 compat/thrust/system/detail/adl/iter_swap.h create mode 100644 compat/thrust/system/detail/adl/logical.h create mode 100644 compat/thrust/system/detail/adl/malloc_and_free.h create mode 100644 compat/thrust/system/detail/adl/merge.h create mode 100644 compat/thrust/system/detail/adl/mismatch.h create mode 100644 compat/thrust/system/detail/adl/partition.h create mode 100644 compat/thrust/system/detail/adl/reduce.h create mode 100644 compat/thrust/system/detail/adl/reduce_by_key.h create mode 100644 compat/thrust/system/detail/adl/remove.h create mode 100644 compat/thrust/system/detail/adl/replace.h create mode 100644 compat/thrust/system/detail/adl/reverse.h create mode 100644 compat/thrust/system/detail/adl/scan.h create mode 100644 compat/thrust/system/detail/adl/scan_by_key.h create mode 100644 compat/thrust/system/detail/adl/scatter.h create mode 100644 compat/thrust/system/detail/adl/sequence.h create mode 100644 compat/thrust/system/detail/adl/set_operations.h create mode 100644 compat/thrust/system/detail/adl/sort.h create mode 100644 compat/thrust/system/detail/adl/swap_ranges.h create mode 100644 compat/thrust/system/detail/adl/tabulate.h create mode 100644 compat/thrust/system/detail/adl/temporary_buffer.h create mode 100644 compat/thrust/system/detail/adl/transform.h create mode 100644 compat/thrust/system/detail/adl/transform_reduce.h create mode 100644 
compat/thrust/system/detail/adl/transform_scan.h create mode 100644 compat/thrust/system/detail/adl/uninitialized_copy.h create mode 100644 compat/thrust/system/detail/adl/uninitialized_fill.h create mode 100644 compat/thrust/system/detail/adl/unique.h create mode 100644 compat/thrust/system/detail/adl/unique_by_key.h create mode 100644 compat/thrust/system/detail/bad_alloc.h create mode 100644 compat/thrust/system/detail/errno.h create mode 100644 compat/thrust/system/detail/error_category.inl create mode 100644 compat/thrust/system/detail/error_code.inl create mode 100644 compat/thrust/system/detail/error_condition.inl create mode 100644 compat/thrust/system/detail/generic/adjacent_difference.h create mode 100644 compat/thrust/system/detail/generic/adjacent_difference.inl create mode 100644 compat/thrust/system/detail/generic/advance.h create mode 100644 compat/thrust/system/detail/generic/advance.inl create mode 100644 compat/thrust/system/detail/generic/binary_search.h create mode 100644 compat/thrust/system/detail/generic/binary_search.inl create mode 100644 compat/thrust/system/detail/generic/copy.h create mode 100644 compat/thrust/system/detail/generic/copy.inl create mode 100644 compat/thrust/system/detail/generic/copy_if.h create mode 100644 compat/thrust/system/detail/generic/copy_if.inl create mode 100644 compat/thrust/system/detail/generic/count.h create mode 100644 compat/thrust/system/detail/generic/count.inl create mode 100644 compat/thrust/system/detail/generic/distance.h create mode 100644 compat/thrust/system/detail/generic/distance.inl create mode 100644 compat/thrust/system/detail/generic/equal.h create mode 100644 compat/thrust/system/detail/generic/equal.inl create mode 100644 compat/thrust/system/detail/generic/extrema.h create mode 100644 compat/thrust/system/detail/generic/extrema.inl create mode 100644 compat/thrust/system/detail/generic/fill.h create mode 100644 compat/thrust/system/detail/generic/find.h create mode 100644 compat/thrust/system/detail/generic/find.inl create mode 100644 compat/thrust/system/detail/generic/for_each.h create mode 100644 compat/thrust/system/detail/generic/gather.h create mode 100644 compat/thrust/system/detail/generic/gather.inl create mode 100644 compat/thrust/system/detail/generic/generate.h create mode 100644 compat/thrust/system/detail/generic/generate.inl create mode 100644 compat/thrust/system/detail/generic/inner_product.h create mode 100644 compat/thrust/system/detail/generic/inner_product.inl create mode 100644 compat/thrust/system/detail/generic/logical.h create mode 100644 compat/thrust/system/detail/generic/memory.h create mode 100644 compat/thrust/system/detail/generic/memory.inl create mode 100644 compat/thrust/system/detail/generic/merge.h create mode 100644 compat/thrust/system/detail/generic/merge.inl create mode 100644 compat/thrust/system/detail/generic/mismatch.h create mode 100644 compat/thrust/system/detail/generic/mismatch.inl create mode 100644 compat/thrust/system/detail/generic/partition.h create mode 100644 compat/thrust/system/detail/generic/partition.inl create mode 100644 compat/thrust/system/detail/generic/reduce.h create mode 100644 compat/thrust/system/detail/generic/reduce.inl create mode 100644 compat/thrust/system/detail/generic/reduce_by_key.h create mode 100644 compat/thrust/system/detail/generic/reduce_by_key.inl create mode 100644 compat/thrust/system/detail/generic/remove.h create mode 100644 compat/thrust/system/detail/generic/remove.inl create mode 100644 
compat/thrust/system/detail/generic/replace.h create mode 100644 compat/thrust/system/detail/generic/replace.inl create mode 100644 compat/thrust/system/detail/generic/reverse.h create mode 100644 compat/thrust/system/detail/generic/reverse.inl create mode 100644 compat/thrust/system/detail/generic/scalar/binary_search.h create mode 100644 compat/thrust/system/detail/generic/scalar/binary_search.inl create mode 100644 compat/thrust/system/detail/generic/scan.h create mode 100644 compat/thrust/system/detail/generic/scan.inl create mode 100644 compat/thrust/system/detail/generic/scan_by_key.h create mode 100644 compat/thrust/system/detail/generic/scan_by_key.inl create mode 100644 compat/thrust/system/detail/generic/scatter.h create mode 100644 compat/thrust/system/detail/generic/scatter.inl create mode 100644 compat/thrust/system/detail/generic/select_system.h create mode 100644 compat/thrust/system/detail/generic/sequence.h create mode 100644 compat/thrust/system/detail/generic/sequence.inl create mode 100644 compat/thrust/system/detail/generic/set_operations.h create mode 100644 compat/thrust/system/detail/generic/set_operations.inl create mode 100644 compat/thrust/system/detail/generic/sort.h create mode 100644 compat/thrust/system/detail/generic/sort.inl create mode 100644 compat/thrust/system/detail/generic/swap_ranges.h create mode 100644 compat/thrust/system/detail/generic/swap_ranges.inl create mode 100644 compat/thrust/system/detail/generic/tabulate.h create mode 100644 compat/thrust/system/detail/generic/tabulate.inl create mode 100644 compat/thrust/system/detail/generic/tag.h create mode 100644 compat/thrust/system/detail/generic/temporary_buffer.h create mode 100644 compat/thrust/system/detail/generic/temporary_buffer.inl create mode 100644 compat/thrust/system/detail/generic/transform.h create mode 100644 compat/thrust/system/detail/generic/transform.inl create mode 100644 compat/thrust/system/detail/generic/transform_reduce.h create mode 100644 compat/thrust/system/detail/generic/transform_reduce.inl create mode 100644 compat/thrust/system/detail/generic/transform_scan.h create mode 100644 compat/thrust/system/detail/generic/transform_scan.inl create mode 100644 compat/thrust/system/detail/generic/type_traits.h create mode 100644 compat/thrust/system/detail/generic/uninitialized_copy.h create mode 100644 compat/thrust/system/detail/generic/uninitialized_copy.inl create mode 100644 compat/thrust/system/detail/generic/uninitialized_fill.h create mode 100644 compat/thrust/system/detail/generic/uninitialized_fill.inl create mode 100644 compat/thrust/system/detail/generic/unique.h create mode 100644 compat/thrust/system/detail/generic/unique.inl create mode 100644 compat/thrust/system/detail/generic/unique_by_key.h create mode 100644 compat/thrust/system/detail/generic/unique_by_key.inl create mode 100644 compat/thrust/system/detail/internal/decompose.h create mode 100644 compat/thrust/system/detail/internal/scalar/adjacent_difference.h create mode 100644 compat/thrust/system/detail/internal/scalar/binary_search.h create mode 100644 compat/thrust/system/detail/internal/scalar/copy.h create mode 100644 compat/thrust/system/detail/internal/scalar/copy.inl create mode 100644 compat/thrust/system/detail/internal/scalar/copy_backward.h create mode 100644 compat/thrust/system/detail/internal/scalar/copy_if.h create mode 100644 compat/thrust/system/detail/internal/scalar/extrema.h create mode 100644 compat/thrust/system/detail/internal/scalar/find.h create mode 100644 
compat/thrust/system/detail/internal/scalar/for_each.h create mode 100644 compat/thrust/system/detail/internal/scalar/general_copy.h create mode 100644 compat/thrust/system/detail/internal/scalar/insertion_sort.h create mode 100644 compat/thrust/system/detail/internal/scalar/merge.h create mode 100644 compat/thrust/system/detail/internal/scalar/merge.inl create mode 100644 compat/thrust/system/detail/internal/scalar/partition.h create mode 100644 compat/thrust/system/detail/internal/scalar/reduce.h create mode 100644 compat/thrust/system/detail/internal/scalar/reduce_by_key.h create mode 100644 compat/thrust/system/detail/internal/scalar/remove.h create mode 100644 compat/thrust/system/detail/internal/scalar/scan.h create mode 100644 compat/thrust/system/detail/internal/scalar/scan_by_key.h create mode 100644 compat/thrust/system/detail/internal/scalar/set_operations.h create mode 100644 compat/thrust/system/detail/internal/scalar/sort.h create mode 100644 compat/thrust/system/detail/internal/scalar/sort.inl create mode 100644 compat/thrust/system/detail/internal/scalar/stable_merge_sort.h create mode 100644 compat/thrust/system/detail/internal/scalar/stable_merge_sort.inl create mode 100644 compat/thrust/system/detail/internal/scalar/stable_primitive_sort.h create mode 100644 compat/thrust/system/detail/internal/scalar/stable_primitive_sort.inl create mode 100644 compat/thrust/system/detail/internal/scalar/stable_radix_sort.h create mode 100644 compat/thrust/system/detail/internal/scalar/stable_radix_sort.inl create mode 100644 compat/thrust/system/detail/internal/scalar/trivial_copy.h create mode 100644 compat/thrust/system/detail/internal/scalar/unique.h create mode 100644 compat/thrust/system/detail/internal/scalar/unique_by_key.h create mode 100644 compat/thrust/system/detail/system_error.inl create mode 100644 compat/thrust/system/error_code.h create mode 100644 compat/thrust/system/omp/detail/adjacent_difference.h create mode 100644 compat/thrust/system/omp/detail/assign_value.h create mode 100644 compat/thrust/system/omp/detail/binary_search.h create mode 100644 compat/thrust/system/omp/detail/copy.h create mode 100644 compat/thrust/system/omp/detail/copy.inl create mode 100644 compat/thrust/system/omp/detail/copy_if.h create mode 100644 compat/thrust/system/omp/detail/copy_if.inl create mode 100644 compat/thrust/system/omp/detail/count.h create mode 100644 compat/thrust/system/omp/detail/default_decomposition.h create mode 100644 compat/thrust/system/omp/detail/default_decomposition.inl create mode 100644 compat/thrust/system/omp/detail/equal.h create mode 100644 compat/thrust/system/omp/detail/execution_policy.h create mode 100644 compat/thrust/system/omp/detail/extrema.h create mode 100644 compat/thrust/system/omp/detail/fill.h create mode 100644 compat/thrust/system/omp/detail/find.h create mode 100644 compat/thrust/system/omp/detail/for_each.h create mode 100644 compat/thrust/system/omp/detail/for_each.inl create mode 100644 compat/thrust/system/omp/detail/gather.h create mode 100644 compat/thrust/system/omp/detail/generate.h create mode 100644 compat/thrust/system/omp/detail/get_value.h create mode 100644 compat/thrust/system/omp/detail/inner_product.h create mode 100644 compat/thrust/system/omp/detail/iter_swap.h create mode 100644 compat/thrust/system/omp/detail/logical.h create mode 100644 compat/thrust/system/omp/detail/malloc_and_free.h create mode 100644 compat/thrust/system/omp/detail/memory.inl create mode 100644 compat/thrust/system/omp/detail/merge.h create mode 
100644 compat/thrust/system/omp/detail/mismatch.h create mode 100644 compat/thrust/system/omp/detail/par.h create mode 100644 compat/thrust/system/omp/detail/partition.h create mode 100644 compat/thrust/system/omp/detail/partition.inl create mode 100644 compat/thrust/system/omp/detail/reduce.h create mode 100644 compat/thrust/system/omp/detail/reduce.inl create mode 100644 compat/thrust/system/omp/detail/reduce_by_key.h create mode 100644 compat/thrust/system/omp/detail/reduce_by_key.inl create mode 100644 compat/thrust/system/omp/detail/reduce_intervals.h create mode 100644 compat/thrust/system/omp/detail/reduce_intervals.inl create mode 100644 compat/thrust/system/omp/detail/remove.h create mode 100644 compat/thrust/system/omp/detail/remove.inl create mode 100644 compat/thrust/system/omp/detail/replace.h create mode 100644 compat/thrust/system/omp/detail/reverse.h create mode 100644 compat/thrust/system/omp/detail/scan.h create mode 100644 compat/thrust/system/omp/detail/scan_by_key.h create mode 100644 compat/thrust/system/omp/detail/scatter.h create mode 100644 compat/thrust/system/omp/detail/sequence.h create mode 100644 compat/thrust/system/omp/detail/set_operations.h create mode 100644 compat/thrust/system/omp/detail/sort.h create mode 100644 compat/thrust/system/omp/detail/sort.inl create mode 100644 compat/thrust/system/omp/detail/swap_ranges.h create mode 100644 compat/thrust/system/omp/detail/tabulate.h create mode 100644 compat/thrust/system/omp/detail/temporary_buffer.h create mode 100644 compat/thrust/system/omp/detail/transform.h create mode 100644 compat/thrust/system/omp/detail/transform_reduce.h create mode 100644 compat/thrust/system/omp/detail/transform_scan.h create mode 100644 compat/thrust/system/omp/detail/uninitialized_copy.h create mode 100644 compat/thrust/system/omp/detail/uninitialized_fill.h create mode 100644 compat/thrust/system/omp/detail/unique.h create mode 100644 compat/thrust/system/omp/detail/unique.inl create mode 100644 compat/thrust/system/omp/detail/unique_by_key.h create mode 100644 compat/thrust/system/omp/detail/unique_by_key.inl create mode 100644 compat/thrust/system/omp/detail/vector.inl create mode 100644 compat/thrust/system/omp/execution_policy.h create mode 100644 compat/thrust/system/omp/memory.h create mode 100644 compat/thrust/system/omp/vector.h create mode 100644 compat/thrust/system/system_error.h create mode 100644 compat/thrust/system/tbb/detail/adjacent_difference.h create mode 100644 compat/thrust/system/tbb/detail/assign_value.h create mode 100644 compat/thrust/system/tbb/detail/binary_search.h create mode 100644 compat/thrust/system/tbb/detail/copy.h create mode 100644 compat/thrust/system/tbb/detail/copy.inl create mode 100644 compat/thrust/system/tbb/detail/copy_if.h create mode 100644 compat/thrust/system/tbb/detail/copy_if.inl create mode 100644 compat/thrust/system/tbb/detail/count.h create mode 100644 compat/thrust/system/tbb/detail/equal.h create mode 100644 compat/thrust/system/tbb/detail/execution_policy.h create mode 100644 compat/thrust/system/tbb/detail/extrema.h create mode 100644 compat/thrust/system/tbb/detail/fill.h create mode 100644 compat/thrust/system/tbb/detail/find.h create mode 100644 compat/thrust/system/tbb/detail/for_each.h create mode 100644 compat/thrust/system/tbb/detail/for_each.inl create mode 100644 compat/thrust/system/tbb/detail/gather.h create mode 100644 compat/thrust/system/tbb/detail/generate.h create mode 100644 compat/thrust/system/tbb/detail/get_value.h create mode 100644 
compat/thrust/system/tbb/detail/inner_product.h create mode 100644 compat/thrust/system/tbb/detail/iter_swap.h create mode 100644 compat/thrust/system/tbb/detail/logical.h create mode 100644 compat/thrust/system/tbb/detail/malloc_and_free.h create mode 100644 compat/thrust/system/tbb/detail/memory.inl create mode 100644 compat/thrust/system/tbb/detail/merge.h create mode 100644 compat/thrust/system/tbb/detail/merge.inl create mode 100644 compat/thrust/system/tbb/detail/mismatch.h create mode 100644 compat/thrust/system/tbb/detail/par.h create mode 100644 compat/thrust/system/tbb/detail/partition.h create mode 100644 compat/thrust/system/tbb/detail/partition.inl create mode 100644 compat/thrust/system/tbb/detail/reduce.h create mode 100644 compat/thrust/system/tbb/detail/reduce.inl create mode 100644 compat/thrust/system/tbb/detail/reduce_by_key.h create mode 100644 compat/thrust/system/tbb/detail/reduce_by_key.inl create mode 100644 compat/thrust/system/tbb/detail/reduce_intervals.h create mode 100644 compat/thrust/system/tbb/detail/remove.h create mode 100644 compat/thrust/system/tbb/detail/remove.inl create mode 100644 compat/thrust/system/tbb/detail/replace.h create mode 100644 compat/thrust/system/tbb/detail/reverse.h create mode 100644 compat/thrust/system/tbb/detail/scan.h create mode 100644 compat/thrust/system/tbb/detail/scan.inl create mode 100644 compat/thrust/system/tbb/detail/scan_by_key.h create mode 100644 compat/thrust/system/tbb/detail/scatter.h create mode 100644 compat/thrust/system/tbb/detail/sequence.h create mode 100644 compat/thrust/system/tbb/detail/set_operations.h create mode 100644 compat/thrust/system/tbb/detail/sort.h create mode 100644 compat/thrust/system/tbb/detail/sort.inl create mode 100644 compat/thrust/system/tbb/detail/swap_ranges.h create mode 100644 compat/thrust/system/tbb/detail/tabulate.h create mode 100644 compat/thrust/system/tbb/detail/temporary_buffer.h create mode 100644 compat/thrust/system/tbb/detail/transform.h create mode 100644 compat/thrust/system/tbb/detail/transform_reduce.h create mode 100644 compat/thrust/system/tbb/detail/transform_scan.h create mode 100644 compat/thrust/system/tbb/detail/uninitialized_copy.h create mode 100644 compat/thrust/system/tbb/detail/uninitialized_fill.h create mode 100644 compat/thrust/system/tbb/detail/unique.h create mode 100644 compat/thrust/system/tbb/detail/unique.inl create mode 100644 compat/thrust/system/tbb/detail/unique_by_key.h create mode 100644 compat/thrust/system/tbb/detail/unique_by_key.inl create mode 100644 compat/thrust/system/tbb/detail/vector.inl create mode 100644 compat/thrust/system/tbb/execution_policy.h create mode 100644 compat/thrust/system/tbb/memory.h create mode 100644 compat/thrust/system/tbb/vector.h create mode 100644 compat/thrust/system_error.h create mode 100644 compat/thrust/tabulate.h create mode 100644 compat/thrust/transform.h create mode 100644 compat/thrust/transform_reduce.h create mode 100644 compat/thrust/transform_scan.h create mode 100644 compat/thrust/tuple.h create mode 100644 compat/thrust/uninitialized_copy.h create mode 100644 compat/thrust/uninitialized_fill.h create mode 100644 compat/thrust/unique.h create mode 100644 compat/thrust/version.h create mode 100644 compat/unistd.h create mode 100644 compile create mode 100644 config.guess create mode 100644 config.sub create mode 100755 configure create mode 100644 configure.ac create mode 100644 configure.sh create mode 100644 cpu-miner.c create mode 100644 cpuminer-config.h create mode 100644 
cpuminer-config.h.in create mode 100644 cuda_blake512.cu create mode 100644 cuda_blake512.h create mode 100644 cuda_combine.cu create mode 100644 cuda_combine.h create mode 100644 cuda_fugue256.cu create mode 100644 cuda_fugue256.h create mode 100644 cuda_groestl512.cu create mode 100644 cuda_groestl512.h create mode 100644 cuda_hefty1.cu create mode 100644 cuda_hefty1.h create mode 100644 cuda_keccak512.cu create mode 100644 cuda_keccak512.h create mode 100644 cuda_sha256.cu create mode 100644 cuda_sha256.h create mode 100644 depcomp create mode 100644 elist.h create mode 100644 files.txt create mode 100644 fugue.c create mode 100644 fuguecoin.cpp create mode 100644 groestl.c create mode 100644 heavy.cu create mode 100644 hefty1.c create mode 100644 hefty1.h create mode 100644 install-sh create mode 100644 keccak.c create mode 100644 miner.h create mode 100644 missing create mode 100644 scrypt.c create mode 100644 sha2.c create mode 100644 sph_blake.h create mode 100644 sph_fugue.h create mode 100644 sph_groestl.h create mode 100644 sph_keccak.h create mode 100644 sph_types.h create mode 100644 uint256.h create mode 100644 util.c diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 0000000..5408215 --- /dev/null +++ b/AUTHORS @@ -0,0 +1 @@ +See README.txt diff --git a/COPYING b/COPYING new file mode 100644 index 0000000..805d7df --- /dev/null +++ b/COPYING @@ -0,0 +1 @@ +See LICENSE.txt diff --git a/ChangeLog b/ChangeLog new file mode 100644 index 0000000..5408215 --- /dev/null +++ b/ChangeLog @@ -0,0 +1 @@ +See README.txt diff --git a/INSTALL b/INSTALL new file mode 100644 index 0000000..2ee1151 --- /dev/null +++ b/INSTALL @@ -0,0 +1,9 @@ + +It is advised to run ./autogen.sh before ./configure (autoconf and automake +need to be installed on your system for autogen.sh to work) + +./configure has an option named --with-cuda that allows you to specify +where your CUDA 5.5 toolkit is installed (usually /usr/local/cuda-5.5, +but some distros may have a different default location) + +See README.txt diff --git a/LICENSE b/LICENSE index 70566f2..2d7f3b9 100644 --- a/LICENSE +++ b/LICENSE @@ -1,674 +1,3 @@ -GNU GENERAL PUBLIC LICENSE - Version 3, 29 June 2007 - - Copyright (C) 2007 Free Software Foundation, Inc. - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - - Preamble - - The GNU General Public License is a free, copyleft license for -software and other kinds of works. - - The licenses for most software and other practical works are designed -to take away your freedom to share and change the works. By contrast, -the GNU General Public License is intended to guarantee your freedom to -share and change all versions of a program--to make sure it remains free -software for all its users. We, the Free Software Foundation, use the -GNU General Public License for most of our software; it applies also to -any other work released this way by its authors. You can apply it to -your programs, too. - - When we speak of free software, we are referring to freedom, not -price. Our General Public Licenses are designed to make sure that you -have the freedom to distribute copies of free software (and charge for -them if you wish), that you receive source code or can get it if you -want it, that you can change the software or use pieces of it in new -free programs, and that you know you can do these things. - - To protect your rights, we need to prevent others from denying you -these rights or asking you to surrender the rights.
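A minimal sketch of the build sequence that the INSTALL file above describes, assuming a Debian-style system with the CUDA 5.5 toolkit under /usr/local/cuda-5.5; the package-manager command, the toolkit path and the --with-cuda=PATH form are assumptions based on common autoconf conventions, not something this patch spells out:

    # Sketch only: regenerate the build system, then configure against CUDA 5.5 and build.
    sudo apt-get install autoconf automake          # autogen.sh needs autoconf and automake installed
    ./autogen.sh                                    # produces the ./configure script
    ./configure --with-cuda=/usr/local/cuda-5.5     # point --with-cuda at your CUDA 5.5 toolkit
    make

Adjust the --with-cuda path if your distribution installs the CUDA toolkit in a different default location, as the INSTALL notes.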
Therefore, you have -certain responsibilities if you distribute copies of the software, or if -you modify it: responsibilities to respect the freedom of others. - - For example, if you distribute copies of such a program, whether -gratis or for a fee, you must pass on to the recipients the same -freedoms that you received. You must make sure that they, too, receive -or can get the source code. And you must show them these terms so they -know their rights. - - Developers that use the GNU GPL protect your rights with two steps: -(1) assert copyright on the software, and (2) offer you this License -giving you legal permission to copy, distribute and/or modify it. - - For the developers' and authors' protection, the GPL clearly explains -that there is no warranty for this free software. For both users' and -authors' sake, the GPL requires that modified versions be marked as -changed, so that their problems will not be attributed erroneously to -authors of previous versions. - - Some devices are designed to deny users access to install or run -modified versions of the software inside them, although the manufacturer -can do so. This is fundamentally incompatible with the aim of -protecting users' freedom to change the software. The systematic -pattern of such abuse occurs in the area of products for individuals to -use, which is precisely where it is most unacceptable. Therefore, we -have designed this version of the GPL to prohibit the practice for those -products. If such problems arise substantially in other domains, we -stand ready to extend this provision to those domains in future versions -of the GPL, as needed to protect the freedom of users. - - Finally, every program is threatened constantly by software patents. -States should not allow patents to restrict development and use of -software on general-purpose computers, but in those that do, we wish to -avoid the special danger that patents applied to a free program could -make it effectively proprietary. To prevent this, the GPL assures that -patents cannot be used to render the program non-free. - - The precise terms and conditions for copying, distribution and -modification follow. - - TERMS AND CONDITIONS - - 0. Definitions. - - "This License" refers to version 3 of the GNU General Public License. - - "Copyright" also means copyright-like laws that apply to other kinds of -works, such as semiconductor masks. - - "The Program" refers to any copyrightable work licensed under this -License. Each licensee is addressed as "you". "Licensees" and -"recipients" may be individuals or organizations. - - To "modify" a work means to copy from or adapt all or part of the work -in a fashion requiring copyright permission, other than the making of an -exact copy. The resulting work is called a "modified version" of the -earlier work or a work "based on" the earlier work. - - A "covered work" means either the unmodified Program or a work based -on the Program. - - To "propagate" a work means to do anything with it that, without -permission, would make you directly or secondarily liable for -infringement under applicable copyright law, except executing it on a -computer or modifying a private copy. Propagation includes copying, -distribution (with or without modification), making available to the -public, and in some countries other activities as well. - - To "convey" a work means any kind of propagation that enables other -parties to make or receive copies. 
Mere interaction with a user through -a computer network, with no transfer of a copy, is not conveying. - - An interactive user interface displays "Appropriate Legal Notices" -to the extent that it includes a convenient and prominently visible -feature that (1) displays an appropriate copyright notice, and (2) -tells the user that there is no warranty for the work (except to the -extent that warranties are provided), that licensees may convey the -work under this License, and how to view a copy of this License. If -the interface presents a list of user commands or options, such as a -menu, a prominent item in the list meets this criterion. - - 1. Source Code. - - The "source code" for a work means the preferred form of the work -for making modifications to it. "Object code" means any non-source -form of a work. - - A "Standard Interface" means an interface that either is an official -standard defined by a recognized standards body, or, in the case of -interfaces specified for a particular programming language, one that -is widely used among developers working in that language. - - The "System Libraries" of an executable work include anything, other -than the work as a whole, that (a) is included in the normal form of -packaging a Major Component, but which is not part of that Major -Component, and (b) serves only to enable use of the work with that -Major Component, or to implement a Standard Interface for which an -implementation is available to the public in source code form. A -"Major Component", in this context, means a major essential component -(kernel, window system, and so on) of the specific operating system -(if any) on which the executable work runs, or a compiler used to -produce the work, or an object code interpreter used to run it. - - The "Corresponding Source" for a work in object code form means all -the source code needed to generate, install, and (for an executable -work) run the object code and to modify the work, including scripts to -control those activities. However, it does not include the work's -System Libraries, or general-purpose tools or generally available free -programs which are used unmodified in performing those activities but -which are not part of the work. For example, Corresponding Source -includes interface definition files associated with source files for -the work, and the source code for shared libraries and dynamically -linked subprograms that the work is specifically designed to require, -such as by intimate data communication or control flow between those -subprograms and other parts of the work. - - The Corresponding Source need not include anything that users -can regenerate automatically from other parts of the Corresponding -Source. - - The Corresponding Source for a work in source code form is that -same work. - - 2. Basic Permissions. - - All rights granted under this License are granted for the term of -copyright on the Program, and are irrevocable provided the stated -conditions are met. This License explicitly affirms your unlimited -permission to run the unmodified Program. The output from running a -covered work is covered by this License only if the output, given its -content, constitutes a covered work. This License acknowledges your -rights of fair use or other equivalent, as provided by copyright law. - - You may make, run and propagate covered works that you do not -convey, without conditions so long as your license otherwise remains -in force. 
You may convey covered works to others for the sole purpose -of having them make modifications exclusively for you, or provide you -with facilities for running those works, provided that you comply with -the terms of this License in conveying all material for which you do -not control copyright. Those thus making or running the covered works -for you must do so exclusively on your behalf, under your direction -and control, on terms that prohibit them from making any copies of -your copyrighted material outside their relationship with you. - - Conveying under any other circumstances is permitted solely under -the conditions stated below. Sublicensing is not allowed; section 10 -makes it unnecessary. - - 3. Protecting Users' Legal Rights From Anti-Circumvention Law. - - No covered work shall be deemed part of an effective technological -measure under any applicable law fulfilling obligations under article -11 of the WIPO copyright treaty adopted on 20 December 1996, or -similar laws prohibiting or restricting circumvention of such -measures. - - When you convey a covered work, you waive any legal power to forbid -circumvention of technological measures to the extent such circumvention -is effected by exercising rights under this License with respect to -the covered work, and you disclaim any intention to limit operation or -modification of the work as a means of enforcing, against the work's -users, your or third parties' legal rights to forbid circumvention of -technological measures. - - 4. Conveying Verbatim Copies. - - You may convey verbatim copies of the Program's source code as you -receive it, in any medium, provided that you conspicuously and -appropriately publish on each copy an appropriate copyright notice; -keep intact all notices stating that this License and any -non-permissive terms added in accord with section 7 apply to the code; -keep intact all notices of the absence of any warranty; and give all -recipients a copy of this License along with the Program. - - You may charge any price or no price for each copy that you convey, -and you may offer support or warranty protection for a fee. - - 5. Conveying Modified Source Versions. - - You may convey a work based on the Program, or the modifications to -produce it from the Program, in the form of source code under the -terms of section 4, provided that you also meet all of these conditions: - - a) The work must carry prominent notices stating that you modified - it, and giving a relevant date. - - b) The work must carry prominent notices stating that it is - released under this License and any conditions added under section - 7. This requirement modifies the requirement in section 4 to - "keep intact all notices". - - c) You must license the entire work, as a whole, under this - License to anyone who comes into possession of a copy. This - License will therefore apply, along with any applicable section 7 - additional terms, to the whole of the work, and all its parts, - regardless of how they are packaged. This License gives no - permission to license the work in any other way, but it does not - invalidate such permission if you have separately received it. - - d) If the work has interactive user interfaces, each must display - Appropriate Legal Notices; however, if the Program has interactive - interfaces that do not display Appropriate Legal Notices, your - work need not make them do so. 
- - A compilation of a covered work with other separate and independent -works, which are not by their nature extensions of the covered work, -and which are not combined with it such as to form a larger program, -in or on a volume of a storage or distribution medium, is called an -"aggregate" if the compilation and its resulting copyright are not -used to limit the access or legal rights of the compilation's users -beyond what the individual works permit. Inclusion of a covered work -in an aggregate does not cause this License to apply to the other -parts of the aggregate. - - 6. Conveying Non-Source Forms. - - You may convey a covered work in object code form under the terms -of sections 4 and 5, provided that you also convey the -machine-readable Corresponding Source under the terms of this License, -in one of these ways: - - a) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by the - Corresponding Source fixed on a durable physical medium - customarily used for software interchange. - - b) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by a - written offer, valid for at least three years and valid for as - long as you offer spare parts or customer support for that product - model, to give anyone who possesses the object code either (1) a - copy of the Corresponding Source for all the software in the - product that is covered by this License, on a durable physical - medium customarily used for software interchange, for a price no - more than your reasonable cost of physically performing this - conveying of source, or (2) access to copy the - Corresponding Source from a network server at no charge. - - c) Convey individual copies of the object code with a copy of the - written offer to provide the Corresponding Source. This - alternative is allowed only occasionally and noncommercially, and - only if you received the object code with such an offer, in accord - with subsection 6b. - - d) Convey the object code by offering access from a designated - place (gratis or for a charge), and offer equivalent access to the - Corresponding Source in the same way through the same place at no - further charge. You need not require recipients to copy the - Corresponding Source along with the object code. If the place to - copy the object code is a network server, the Corresponding Source - may be on a different server (operated by you or a third party) - that supports equivalent copying facilities, provided you maintain - clear directions next to the object code saying where to find the - Corresponding Source. Regardless of what server hosts the - Corresponding Source, you remain obligated to ensure that it is - available for as long as needed to satisfy these requirements. - - e) Convey the object code using peer-to-peer transmission, provided - you inform other peers where the object code and Corresponding - Source of the work are being offered to the general public at no - charge under subsection 6d. - - A separable portion of the object code, whose source code is excluded -from the Corresponding Source as a System Library, need not be -included in conveying the object code work. - - A "User Product" is either (1) a "consumer product", which means any -tangible personal property which is normally used for personal, family, -or household purposes, or (2) anything designed or sold for incorporation -into a dwelling. 
In determining whether a product is a consumer product, -doubtful cases shall be resolved in favor of coverage. For a particular -product received by a particular user, "normally used" refers to a -typical or common use of that class of product, regardless of the status -of the particular user or of the way in which the particular user -actually uses, or expects or is expected to use, the product. A product -is a consumer product regardless of whether the product has substantial -commercial, industrial or non-consumer uses, unless such uses represent -the only significant mode of use of the product. - - "Installation Information" for a User Product means any methods, -procedures, authorization keys, or other information required to install -and execute modified versions of a covered work in that User Product from -a modified version of its Corresponding Source. The information must -suffice to ensure that the continued functioning of the modified object -code is in no case prevented or interfered with solely because -modification has been made. - - If you convey an object code work under this section in, or with, or -specifically for use in, a User Product, and the conveying occurs as -part of a transaction in which the right of possession and use of the -User Product is transferred to the recipient in perpetuity or for a -fixed term (regardless of how the transaction is characterized), the -Corresponding Source conveyed under this section must be accompanied -by the Installation Information. But this requirement does not apply -if neither you nor any third party retains the ability to install -modified object code on the User Product (for example, the work has -been installed in ROM). - - The requirement to provide Installation Information does not include a -requirement to continue to provide support service, warranty, or updates -for a work that has been modified or installed by the recipient, or for -the User Product in which it has been modified or installed. Access to a -network may be denied when the modification itself materially and -adversely affects the operation of the network or violates the rules and -protocols for communication across the network. - - Corresponding Source conveyed, and Installation Information provided, -in accord with this section must be in a format that is publicly -documented (and with an implementation available to the public in -source code form), and must require no special password or key for -unpacking, reading or copying. - - 7. Additional Terms. - - "Additional permissions" are terms that supplement the terms of this -License by making exceptions from one or more of its conditions. -Additional permissions that are applicable to the entire Program shall -be treated as though they were included in this License, to the extent -that they are valid under applicable law. If additional permissions -apply only to part of the Program, that part may be used separately -under those permissions, but the entire Program remains governed by -this License without regard to the additional permissions. - - When you convey a copy of a covered work, you may at your option -remove any additional permissions from that copy, or from any part of -it. (Additional permissions may be written to require their own -removal in certain cases when you modify the work.) You may place -additional permissions on material, added by you to a covered work, -for which you have or can give appropriate copyright permission. 
- - Notwithstanding any other provision of this License, for material you -add to a covered work, you may (if authorized by the copyright holders of -that material) supplement the terms of this License with terms: - - a) Disclaiming warranty or limiting liability differently from the - terms of sections 15 and 16 of this License; or - - b) Requiring preservation of specified reasonable legal notices or - author attributions in that material or in the Appropriate Legal - Notices displayed by works containing it; or - - c) Prohibiting misrepresentation of the origin of that material, or - requiring that modified versions of such material be marked in - reasonable ways as different from the original version; or - - d) Limiting the use for publicity purposes of names of licensors or - authors of the material; or - - e) Declining to grant rights under trademark law for use of some - trade names, trademarks, or service marks; or - - f) Requiring indemnification of licensors and authors of that - material by anyone who conveys the material (or modified versions of - it) with contractual assumptions of liability to the recipient, for - any liability that these contractual assumptions directly impose on - those licensors and authors. - - All other non-permissive additional terms are considered "further -restrictions" within the meaning of section 10. If the Program as you -received it, or any part of it, contains a notice stating that it is -governed by this License along with a term that is a further -restriction, you may remove that term. If a license document contains -a further restriction but permits relicensing or conveying under this -License, you may add to a covered work material governed by the terms -of that license document, provided that the further restriction does -not survive such relicensing or conveying. - - If you add terms to a covered work in accord with this section, you -must place, in the relevant source files, a statement of the -additional terms that apply to those files, or a notice indicating -where to find the applicable terms. - - Additional terms, permissive or non-permissive, may be stated in the -form of a separately written license, or stated as exceptions; -the above requirements apply either way. - - 8. Termination. - - You may not propagate or modify a covered work except as expressly -provided under this License. Any attempt otherwise to propagate or -modify it is void, and will automatically terminate your rights under -this License (including any patent licenses granted under the third -paragraph of section 11). - - However, if you cease all violation of this License, then your -license from a particular copyright holder is reinstated (a) -provisionally, unless and until the copyright holder explicitly and -finally terminates your license, and (b) permanently, if the copyright -holder fails to notify you of the violation by some reasonable means -prior to 60 days after the cessation. - - Moreover, your license from a particular copyright holder is -reinstated permanently if the copyright holder notifies you of the -violation by some reasonable means, this is the first time you have -received notice of violation of this License (for any work) from that -copyright holder, and you cure the violation prior to 30 days after -your receipt of the notice. - - Termination of your rights under this section does not terminate the -licenses of parties who have received copies or rights from you under -this License. 
If your rights have been terminated and not permanently -reinstated, you do not qualify to receive new licenses for the same -material under section 10. - - 9. Acceptance Not Required for Having Copies. - - You are not required to accept this License in order to receive or -run a copy of the Program. Ancillary propagation of a covered work -occurring solely as a consequence of using peer-to-peer transmission -to receive a copy likewise does not require acceptance. However, -nothing other than this License grants you permission to propagate or -modify any covered work. These actions infringe copyright if you do -not accept this License. Therefore, by modifying or propagating a -covered work, you indicate your acceptance of this License to do so. - - 10. Automatic Licensing of Downstream Recipients. - - Each time you convey a covered work, the recipient automatically -receives a license from the original licensors, to run, modify and -propagate that work, subject to this License. You are not responsible -for enforcing compliance by third parties with this License. - - An "entity transaction" is a transaction transferring control of an -organization, or substantially all assets of one, or subdividing an -organization, or merging organizations. If propagation of a covered -work results from an entity transaction, each party to that -transaction who receives a copy of the work also receives whatever -licenses to the work the party's predecessor in interest had or could -give under the previous paragraph, plus a right to possession of the -Corresponding Source of the work from the predecessor in interest, if -the predecessor has it or can get it with reasonable efforts. - - You may not impose any further restrictions on the exercise of the -rights granted or affirmed under this License. For example, you may -not impose a license fee, royalty, or other charge for exercise of -rights granted under this License, and you may not initiate litigation -(including a cross-claim or counterclaim in a lawsuit) alleging that -any patent claim is infringed by making, using, selling, offering for -sale, or importing the Program or any portion of it. - - 11. Patents. - - A "contributor" is a copyright holder who authorizes use under this -License of the Program or a work on which the Program is based. The -work thus licensed is called the contributor's "contributor version". - - A contributor's "essential patent claims" are all patent claims -owned or controlled by the contributor, whether already acquired or -hereafter acquired, that would be infringed by some manner, permitted -by this License, of making, using, or selling its contributor version, -but do not include claims that would be infringed only as a -consequence of further modification of the contributor version. For -purposes of this definition, "control" includes the right to grant -patent sublicenses in a manner consistent with the requirements of -this License. - - Each contributor grants you a non-exclusive, worldwide, royalty-free -patent license under the contributor's essential patent claims, to -make, use, sell, offer for sale, import and otherwise run, modify and -propagate the contents of its contributor version. - - In the following three paragraphs, a "patent license" is any express -agreement or commitment, however denominated, not to enforce a patent -(such as an express permission to practice a patent or covenant not to -sue for patent infringement). 
To "grant" such a patent license to a -party means to make such an agreement or commitment not to enforce a -patent against the party. - - If you convey a covered work, knowingly relying on a patent license, -and the Corresponding Source of the work is not available for anyone -to copy, free of charge and under the terms of this License, through a -publicly available network server or other readily accessible means, -then you must either (1) cause the Corresponding Source to be so -available, or (2) arrange to deprive yourself of the benefit of the -patent license for this particular work, or (3) arrange, in a manner -consistent with the requirements of this License, to extend the patent -license to downstream recipients. "Knowingly relying" means you have -actual knowledge that, but for the patent license, your conveying the -covered work in a country, or your recipient's use of the covered work -in a country, would infringe one or more identifiable patents in that -country that you have reason to believe are valid. - - If, pursuant to or in connection with a single transaction or -arrangement, you convey, or propagate by procuring conveyance of, a -covered work, and grant a patent license to some of the parties -receiving the covered work authorizing them to use, propagate, modify -or convey a specific copy of the covered work, then the patent license -you grant is automatically extended to all recipients of the covered -work and works based on it. - - A patent license is "discriminatory" if it does not include within -the scope of its coverage, prohibits the exercise of, or is -conditioned on the non-exercise of one or more of the rights that are -specifically granted under this License. You may not convey a covered -work if you are a party to an arrangement with a third party that is -in the business of distributing software, under which you make payment -to the third party based on the extent of your activity of conveying -the work, and under which the third party grants, to any of the -parties who would receive the covered work from you, a discriminatory -patent license (a) in connection with copies of the covered work -conveyed by you (or copies made from those copies), or (b) primarily -for and in connection with specific products or compilations that -contain the covered work, unless you entered into that arrangement, -or that patent license was granted, prior to 28 March 2007. - - Nothing in this License shall be construed as excluding or limiting -any implied license or other defenses to infringement that may -otherwise be available to you under applicable patent law. - - 12. No Surrender of Others' Freedom. - - If conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot convey a -covered work so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you may -not convey it at all. For example, if you agree to terms that obligate you -to collect a royalty for further conveying from those to whom you convey -the Program, the only way you could satisfy both those terms and this -License would be to refrain entirely from conveying the Program. - - 13. Use with the GNU Affero General Public License. 
- - Notwithstanding any other provision of this License, you have -permission to link or combine any covered work with a work licensed -under version 3 of the GNU Affero General Public License into a single -combined work, and to convey the resulting work. The terms of this -License will continue to apply to the part which is the covered work, -but the special requirements of the GNU Affero General Public License, -section 13, concerning interaction through a network will apply to the -combination as such. - - 14. Revised Versions of this License. - - The Free Software Foundation may publish revised and/or new versions of -the GNU General Public License from time to time. Such new versions will -be similar in spirit to the present version, but may differ in detail to -address new problems or concerns. - - Each version is given a distinguishing version number. If the -Program specifies that a certain numbered version of the GNU General -Public License "or any later version" applies to it, you have the -option of following the terms and conditions either of that numbered -version or of any later version published by the Free Software -Foundation. If the Program does not specify a version number of the -GNU General Public License, you may choose any version ever published -by the Free Software Foundation. - - If the Program specifies that a proxy can decide which future -versions of the GNU General Public License can be used, that proxy's -public statement of acceptance of a version permanently authorizes you -to choose that version for the Program. - - Later license versions may give you additional or different -permissions. However, no additional obligations are imposed on any -author or copyright holder as a result of your choosing to follow a -later version. - - 15. Disclaimer of Warranty. - - THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY -APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT -HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY -OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, -THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM -IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF -ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - 16. Limitation of Liability. - - IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING -WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS -THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY -GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE -USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF -DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD -PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), -EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF -SUCH DAMAGES. - - 17. Interpretation of Sections 15 and 16. - - If the disclaimer of warranty and limitation of liability provided -above cannot be given local legal effect according to their terms, -reviewing courts shall apply local law that most closely approximates -an absolute waiver of all civil liability in connection with the -Program, unless a warranty or assumption of liability accompanies a -copy of the Program in return for a fee. 
- - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Programs - - If you develop a new program, and you want it to be of the greatest -possible use to the public, the best way to achieve this is to make it -free software which everyone can redistribute and change under these terms. - - To do so, attach the following notices to the program. It is safest -to attach them to the start of each source file to most effectively -state the exclusion of warranty; and each file should have at least -the "copyright" line and a pointer to where the full notice is found. - - {one line to give the program's name and a brief idea of what it does.} - Copyright (C) {year} {name of author} - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. - -Also add information on how to contact you by electronic and paper mail. - - If the program does terminal interaction, make it output a short -notice like this when it starts in an interactive mode: - - {project} Copyright (C) {year} {fullname} - This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. - This is free software, and you are welcome to redistribute it - under certain conditions; type `show c' for details. - -The hypothetical commands `show w' and `show c' should show the appropriate -parts of the General Public License. Of course, your program's commands -might be different; for a GUI interface, you would use an "about box". - - You should also get your employer (if you work as a programmer) or school, -if any, to sign a "copyright disclaimer" for the program, if necessary. -For more information on this, and how to apply and follow the GNU GPL, see -<http://www.gnu.org/licenses/>. - - The GNU General Public License does not permit incorporating your program -into proprietary programs. If your program is a subroutine library, you -may consider it more useful to permit linking proprietary applications with -the library. If this is what you want to do, use the GNU Lesser General -Public License instead of this License. But first, please read -<http://www.gnu.org/philosophy/why-not-lgpl.html>. \ No newline at end of file +ccminer is available under the terms of the GNU Public License version 3. + +See LICENSE.TXT for details. diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..8e70fb3 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,885 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/> + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. 
We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. 
The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. 
+ + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. 
This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. 
+ + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. 
If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). 
+ + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". 
+ + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. 
+ + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. 
+ + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + <program> Copyright (C) <year> <name of author> + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +<http://www.gnu.org/licenses/>. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +<http://www.gnu.org/philosophy/why-not-lgpl.html>. 
+ + + + + + + + + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. 
Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + diff --git a/Makefile.am b/Makefile.am new file mode 100644 index 0000000..26f8d7e --- /dev/null +++ b/Makefile.am @@ -0,0 +1,45 @@ + +if WANT_JANSSON +JANSSON_INCLUDES= -I$(top_srcdir)/compat/jansson +else +JANSSON_INCLUDES= +endif + +EXTRA_DIST = autogen.sh README.txt LICENSE.txt \ + cudaminer.sln cudaminer.vcxproj cudaminer.vcxproj.filters \ + compat/gettimeofday.c compat/getopt/getopt_long.c cpuminer-config.h.in + +SUBDIRS = compat + +bin_PROGRAMS = ccminer + +ccminer_SOURCES = elist.h miner.h compat.h \ + compat/inttypes.h compat/stdbool.h compat/unistd.h \ + compat/sys/time.h compat/getopt/getopt.h \ + cpu-miner.c util.c blake.c groestl.c hefty1.c keccak.c scrypt.c sha2.c \ + sph_blake.h sph_groestl.h sph_keccak.h sph_types.h \ + heavy.cu \ + cuda_blake512.cu cuda_blake512.h \ + cuda_combine.cu cuda_combine.h \ + cuda_groestl512.cu cuda_groestl512.h \ + cuda_hefty1.cu cuda_hefty1.h \ + cuda_keccak512.cu cuda_keccak512.h \ + cuda_sha256.cu cuda_sha256.h \ + cuda_fugue256.cu \ + fuguecoin.cpp fugue.c sph_fugue.h uint256.h + + +ccminer_LDFLAGS = $(PTHREAD_FLAGS) @CUDA_LDFLAGS@ +ccminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ @CUDA_LIBS@ @OPENMP_CFLAGS@ @LIBS@ +ccminer_CPPFLAGS = -msse2 @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES) -DSCRYPT_KECCAK512 -DSCRYPT_CHACHA -DSCRYPT_CHOOSE_COMPILETIME + +.cu.o: + $(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -arch=compute_20 --maxrregcount=63 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< + +## Thrust needs Compute 2.0 minimum +#heavy.o: heavy.cu +# $(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -arch=compute_20 --maxrregcount=63 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< +# +#cuda_hefty1.o: cuda_hefty1.cu +# $(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -arch=compute_20 --maxrregcount=63 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< + diff --git a/Makefile.in b/Makefile.in new file mode 100644 index 0000000..963c70d --- /dev/null +++ b/Makefile.in @@ -0,0 +1,1107 @@ +# Makefile.in generated by automake 1.13.3 from Makefile.am. +# @configure_input@ + +# Copyright (C) 1994-2013 Free Software Foundation, Inc. + +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + +@SET_MAKE@ + +VPATH = @srcdir@ +am__is_gnu_make = test -n '$(MAKEFILE_LIST)' && test -n '$(MAKELEVEL)' +am__make_running_with_option = \ + case $${target_option-} in \ + ?) 
;; \ + *) echo "am__make_running_with_option: internal error: invalid" \ + "target option '$${target_option-}' specified" >&2; \ + exit 1;; \ + esac; \ + has_opt=no; \ + sane_makeflags=$$MAKEFLAGS; \ + if $(am__is_gnu_make); then \ + sane_makeflags=$$MFLAGS; \ + else \ + case $$MAKEFLAGS in \ + *\\[\ \ ]*) \ + bs=\\; \ + sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \ + | sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \ + esac; \ + fi; \ + skip_next=no; \ + strip_trailopt () \ + { \ + flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \ + }; \ + for flg in $$sane_makeflags; do \ + test $$skip_next = yes && { skip_next=no; continue; }; \ + case $$flg in \ + *=*|--*) continue;; \ + -*I) strip_trailopt 'I'; skip_next=yes;; \ + -*I?*) strip_trailopt 'I';; \ + -*O) strip_trailopt 'O'; skip_next=yes;; \ + -*O?*) strip_trailopt 'O';; \ + -*l) strip_trailopt 'l'; skip_next=yes;; \ + -*l?*) strip_trailopt 'l';; \ + -[dEDm]) skip_next=yes;; \ + -[JT]) skip_next=yes;; \ + esac; \ + case $$flg in \ + *$$target_option*) has_opt=yes; break;; \ + esac; \ + done; \ + test $$has_opt = yes +am__make_dryrun = (target_option=n; $(am__make_running_with_option)) +am__make_keepgoing = (target_option=k; $(am__make_running_with_option)) +pkgdatadir = $(datadir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkglibexecdir = $(libexecdir)/@PACKAGE@ +am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +install_sh_DATA = $(install_sh) -c -m 644 +install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +build_triplet = @build@ +host_triplet = @host@ +target_triplet = @target@ +bin_PROGRAMS = ccminer$(EXEEXT) +subdir = . 
+DIST_COMMON = INSTALL NEWS README AUTHORS ChangeLog \ + $(srcdir)/Makefile.in $(srcdir)/Makefile.am \ + $(top_srcdir)/configure $(am__configure_deps) \ + $(srcdir)/cpuminer-config.h.in depcomp COPYING compile \ + config.guess config.sub install-sh missing +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +am__aclocal_m4_deps = $(top_srcdir)/configure.ac +am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ + $(ACLOCAL_M4) +am__CONFIG_DISTCLEAN_FILES = config.status config.cache config.log \ + configure.lineno config.status.lineno +mkinstalldirs = $(install_sh) -d +CONFIG_HEADER = cpuminer-config.h +CONFIG_CLEAN_FILES = +CONFIG_CLEAN_VPATH_FILES = +am__installdirs = "$(DESTDIR)$(bindir)" +PROGRAMS = $(bin_PROGRAMS) +am_ccminer_OBJECTS = ccminer-cpu-miner.$(OBJEXT) \ + ccminer-util.$(OBJEXT) ccminer-blake.$(OBJEXT) \ + ccminer-groestl.$(OBJEXT) ccminer-hefty1.$(OBJEXT) \ + ccminer-keccak.$(OBJEXT) ccminer-scrypt.$(OBJEXT) \ + ccminer-sha2.$(OBJEXT) heavy.$(OBJEXT) cuda_blake512.$(OBJEXT) \ + cuda_combine.$(OBJEXT) cuda_groestl512.$(OBJEXT) \ + cuda_hefty1.$(OBJEXT) cuda_keccak512.$(OBJEXT) \ + cuda_sha256.$(OBJEXT) cuda_fugue256.$(OBJEXT) \ + ccminer-fuguecoin.$(OBJEXT) ccminer-fugue.$(OBJEXT) +ccminer_OBJECTS = $(am_ccminer_OBJECTS) +ccminer_DEPENDENCIES = +ccminer_LINK = $(CXXLD) $(AM_CXXFLAGS) $(CXXFLAGS) $(ccminer_LDFLAGS) \ + $(LDFLAGS) -o $@ +AM_V_P = $(am__v_P_@AM_V@) +am__v_P_ = $(am__v_P_@AM_DEFAULT_V@) +am__v_P_0 = false +am__v_P_1 = : +AM_V_GEN = $(am__v_GEN_@AM_V@) +am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@) +am__v_GEN_0 = @echo " GEN " $@; +am__v_GEN_1 = +AM_V_at = $(am__v_at_@AM_V@) +am__v_at_ = $(am__v_at_@AM_DEFAULT_V@) +am__v_at_0 = @ +am__v_at_1 = +DEFAULT_INCLUDES = -I.@am__isrc@ +depcomp = $(SHELL) $(top_srcdir)/depcomp +am__depfiles_maybe = depfiles +am__mv = mv -f +AM_V_lt = $(am__v_lt_@AM_V@) +am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@) +am__v_lt_0 = --silent +am__v_lt_1 = +COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +AM_V_CC = $(am__v_CC_@AM_V@) +am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@) +am__v_CC_0 = @echo " CC " $@; +am__v_CC_1 = +CCLD = $(CC) +LINK = $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +AM_V_CCLD = $(am__v_CCLD_@AM_V@) +am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@) +am__v_CCLD_0 = @echo " CCLD " $@; +am__v_CCLD_1 = +CXXCOMPILE = $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ + $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) +AM_V_CXX = $(am__v_CXX_@AM_V@) +am__v_CXX_ = $(am__v_CXX_@AM_DEFAULT_V@) +am__v_CXX_0 = @echo " CXX " $@; +am__v_CXX_1 = +CXXLD = $(CXX) +CXXLINK = $(CXXLD) $(AM_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) \ + -o $@ +AM_V_CXXLD = $(am__v_CXXLD_@AM_V@) +am__v_CXXLD_ = $(am__v_CXXLD_@AM_DEFAULT_V@) +am__v_CXXLD_0 = @echo " CXXLD " $@; +am__v_CXXLD_1 = +SOURCES = $(ccminer_SOURCES) +DIST_SOURCES = $(ccminer_SOURCES) +RECURSIVE_TARGETS = all-recursive check-recursive cscopelist-recursive \ + ctags-recursive dvi-recursive html-recursive info-recursive \ + install-data-recursive install-dvi-recursive \ + install-exec-recursive install-html-recursive \ + install-info-recursive install-pdf-recursive \ + install-ps-recursive install-recursive installcheck-recursive \ + installdirs-recursive pdf-recursive ps-recursive \ + tags-recursive uninstall-recursive +am__can_run_installinfo = \ + case $$AM_UPDATE_INFO_DIR in \ + n|no|NO) false;; \ + *) (install-info --version) >/dev/null 2>&1;; \ + esac +RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive 
\ + distclean-recursive maintainer-clean-recursive +am__recursive_targets = \ + $(RECURSIVE_TARGETS) \ + $(RECURSIVE_CLEAN_TARGETS) \ + $(am__extra_recursive_targets) +AM_RECURSIVE_TARGETS = $(am__recursive_targets:-recursive=) TAGS CTAGS \ + cscope distdir dist dist-all distcheck +am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) \ + $(LISP)cpuminer-config.h.in +# Read a list of newline-separated strings from the standard input, +# and print each of them once, without duplicates. Input order is +# *not* preserved. +am__uniquify_input = $(AWK) '\ + BEGIN { nonempty = 0; } \ + { items[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in items) print i; }; } \ +' +# Make sure the list of sources is unique. This is necessary because, +# e.g., the same source file might be shared among _SOURCES variables +# for different programs/libraries. +am__define_uniq_tagged_files = \ + list='$(am__tagged_files)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | $(am__uniquify_input)` +ETAGS = etags +CTAGS = ctags +CSCOPE = cscope +DIST_SUBDIRS = $(SUBDIRS) +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) +distdir = $(PACKAGE)-$(VERSION) +top_distdir = $(distdir) +am__remove_distdir = \ + if test -d "$(distdir)"; then \ + find "$(distdir)" -type d ! -perm -200 -exec chmod u+w {} ';' \ + && rm -rf "$(distdir)" \ + || { sleep 5 && rm -rf "$(distdir)"; }; \ + else :; fi +am__post_remove_distdir = $(am__remove_distdir) +am__relativize = \ + dir0=`pwd`; \ + sed_first='s,^\([^/]*\)/.*$$,\1,'; \ + sed_rest='s,^[^/]*/*,,'; \ + sed_last='s,^.*/\([^/]*\)$$,\1,'; \ + sed_butlast='s,/*[^/]*$$,,'; \ + while test -n "$$dir1"; do \ + first=`echo "$$dir1" | sed -e "$$sed_first"`; \ + if test "$$first" != "."; then \ + if test "$$first" = ".."; then \ + dir2=`echo "$$dir0" | sed -e "$$sed_last"`/"$$dir2"; \ + dir0=`echo "$$dir0" | sed -e "$$sed_butlast"`; \ + else \ + first2=`echo "$$dir2" | sed -e "$$sed_first"`; \ + if test "$$first2" = "$$first"; then \ + dir2=`echo "$$dir2" | sed -e "$$sed_rest"`; \ + else \ + dir2="../$$dir2"; \ + fi; \ + dir0="$$dir0"/"$$first"; \ + fi; \ + fi; \ + dir1=`echo "$$dir1" | sed -e "$$sed_rest"`; \ + done; \ + reldir="$$dir2" +DIST_ARCHIVES = $(distdir).tar.gz +GZIP_ENV = --best +DIST_TARGETS = dist-gzip +distuninstallcheck_listfiles = find . -type f -print +am__distuninstallcheck_listfiles = $(distuninstallcheck_listfiles) \ + | sed 's|^\./|$(prefix)/|' | grep -v '$(infodir)/dir$$' +distcleancheck_listfiles = find . 
-type f -print +ACLOCAL = @ACLOCAL@ +ALLOCA = @ALLOCA@ +AMTAR = @AMTAR@ +AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ +AUTOCONF = @AUTOCONF@ +AUTOHEADER = @AUTOHEADER@ +AUTOMAKE = @AUTOMAKE@ +AWK = @AWK@ +CC = @CC@ +CCAS = @CCAS@ +CCASDEPMODE = @CCASDEPMODE@ +CCASFLAGS = @CCASFLAGS@ +CCDEPMODE = @CCDEPMODE@ +CFLAGS = @CFLAGS@ +CPP = @CPP@ +CPPFLAGS = @CPPFLAGS@ +CUDA_CFLAGS = @CUDA_CFLAGS@ +CUDA_LDFLAGS = @CUDA_LDFLAGS@ +CUDA_LIBS = @CUDA_LIBS@ +CXX = @CXX@ +CXXDEPMODE = @CXXDEPMODE@ +CXXFLAGS = @CXXFLAGS@ +CYGPATH_W = @CYGPATH_W@ +DEFS = @DEFS@ +DEPDIR = @DEPDIR@ +ECHO_C = @ECHO_C@ +ECHO_N = @ECHO_N@ +ECHO_T = @ECHO_T@ +EGREP = @EGREP@ +EXEEXT = @EXEEXT@ +GREP = @GREP@ +INSTALL = @INSTALL@ +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ +JANSSON_LIBS = @JANSSON_LIBS@ +LDFLAGS = @LDFLAGS@ +LIBCURL = @LIBCURL@ +LIBCURL_CPPFLAGS = @LIBCURL_CPPFLAGS@ +LIBOBJS = @LIBOBJS@ +LIBS = @LIBS@ +LTLIBOBJS = @LTLIBOBJS@ +MAINT = @MAINT@ +MAKEINFO = @MAKEINFO@ +MKDIR_P = @MKDIR_P@ +NVCC = @NVCC@ +OBJEXT = @OBJEXT@ +OPENMP_CFLAGS = @OPENMP_CFLAGS@ +PACKAGE = @PACKAGE@ +PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ +PACKAGE_NAME = @PACKAGE_NAME@ +PACKAGE_STRING = @PACKAGE_STRING@ +PACKAGE_TARNAME = @PACKAGE_TARNAME@ +PACKAGE_URL = @PACKAGE_URL@ +PACKAGE_VERSION = @PACKAGE_VERSION@ +PATH_SEPARATOR = @PATH_SEPARATOR@ +PTHREAD_FLAGS = @PTHREAD_FLAGS@ +PTHREAD_LIBS = @PTHREAD_LIBS@ +RANLIB = @RANLIB@ +SET_MAKE = @SET_MAKE@ +SHELL = @SHELL@ +STRIP = @STRIP@ +VERSION = @VERSION@ +WS2_LIBS = @WS2_LIBS@ +_libcurl_config = @_libcurl_config@ +abs_builddir = @abs_builddir@ +abs_srcdir = @abs_srcdir@ +abs_top_builddir = @abs_top_builddir@ +abs_top_srcdir = @abs_top_srcdir@ +ac_ct_CC = @ac_ct_CC@ +ac_ct_CXX = @ac_ct_CXX@ +am__include = @am__include@ +am__leading_dot = @am__leading_dot@ +am__quote = @am__quote@ +am__tar = @am__tar@ +am__untar = @am__untar@ +bindir = @bindir@ +build = @build@ +build_alias = @build_alias@ +build_cpu = @build_cpu@ +build_os = @build_os@ +build_vendor = @build_vendor@ +builddir = @builddir@ +datadir = @datadir@ +datarootdir = @datarootdir@ +docdir = @docdir@ +dvidir = @dvidir@ +exec_prefix = @exec_prefix@ +host = @host@ +host_alias = @host_alias@ +host_cpu = @host_cpu@ +host_os = @host_os@ +host_vendor = @host_vendor@ +htmldir = @htmldir@ +includedir = @includedir@ +infodir = @infodir@ +install_sh = @install_sh@ +libdir = @libdir@ +libexecdir = @libexecdir@ +localedir = @localedir@ +localstatedir = @localstatedir@ +mandir = @mandir@ +mkdir_p = @mkdir_p@ +oldincludedir = @oldincludedir@ +pdfdir = @pdfdir@ +prefix = @prefix@ +program_transform_name = @program_transform_name@ +psdir = @psdir@ +sbindir = @sbindir@ +sharedstatedir = @sharedstatedir@ +srcdir = @srcdir@ +sysconfdir = @sysconfdir@ +target = @target@ +target_alias = @target_alias@ +target_cpu = @target_cpu@ +target_os = @target_os@ +target_vendor = @target_vendor@ +top_build_prefix = @top_build_prefix@ +top_builddir = @top_builddir@ +top_srcdir = @top_srcdir@ +@WANT_JANSSON_FALSE@JANSSON_INCLUDES = +@WANT_JANSSON_TRUE@JANSSON_INCLUDES = -I$(top_srcdir)/compat/jansson +EXTRA_DIST = autogen.sh README.txt LICENSE.txt \ + cudaminer.sln cudaminer.vcxproj cudaminer.vcxproj.filters \ + compat/gettimeofday.c compat/getopt/getopt_long.c cpuminer-config.h.in + +SUBDIRS = compat +ccminer_SOURCES = elist.h miner.h compat.h \ + compat/inttypes.h compat/stdbool.h compat/unistd.h \ + compat/sys/time.h compat/getopt/getopt.h \ + cpu-miner.c util.c 
blake.c groestl.c hefty1.c keccak.c scrypt.c sha2.c \ + sph_blake.h sph_groestl.h sph_keccak.h sph_types.h \ + heavy.cu \ + cuda_blake512.cu cuda_blake512.h \ + cuda_combine.cu cuda_combine.h \ + cuda_groestl512.cu cuda_groestl512.h \ + cuda_hefty1.cu cuda_hefty1.h \ + cuda_keccak512.cu cuda_keccak512.h \ + cuda_sha256.cu cuda_sha256.h \ + cuda_fugue256.cu \ + fuguecoin.cpp fugue.c sph_fugue.h uint256.h + +ccminer_LDFLAGS = $(PTHREAD_FLAGS) @CUDA_LDFLAGS@ +ccminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ @CUDA_LIBS@ @OPENMP_CFLAGS@ @LIBS@ +ccminer_CPPFLAGS = -msse2 @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES) -DSCRYPT_KECCAK512 -DSCRYPT_CHACHA -DSCRYPT_CHOOSE_COMPILETIME +all: cpuminer-config.h + $(MAKE) $(AM_MAKEFLAGS) all-recursive + +.SUFFIXES: +.SUFFIXES: .c .cpp .cu .o .obj +am--refresh: Makefile + @: +$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps) + @for dep in $?; do \ + case '$(am__configure_deps)' in \ + *$$dep*) \ + echo ' cd $(srcdir) && $(AUTOMAKE) --gnu'; \ + $(am__cd) $(srcdir) && $(AUTOMAKE) --gnu \ + && exit 0; \ + exit 1;; \ + esac; \ + done; \ + echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu Makefile'; \ + $(am__cd) $(top_srcdir) && \ + $(AUTOMAKE) --gnu Makefile +.PRECIOUS: Makefile +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + @case '$?' in \ + *config.status*) \ + echo ' $(SHELL) ./config.status'; \ + $(SHELL) ./config.status;; \ + *) \ + echo ' cd $(top_builddir) && $(SHELL) ./config.status $@ $(am__depfiles_maybe)'; \ + cd $(top_builddir) && $(SHELL) ./config.status $@ $(am__depfiles_maybe);; \ + esac; + +$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) + $(SHELL) ./config.status --recheck + +$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps) + $(am__cd) $(srcdir) && $(AUTOCONF) +$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps) + $(am__cd) $(srcdir) && $(ACLOCAL) $(ACLOCAL_AMFLAGS) +$(am__aclocal_m4_deps): + +cpuminer-config.h: stamp-h1 + @if test ! -f $@; then rm -f stamp-h1; else :; fi + @if test ! 
-f $@; then $(MAKE) $(AM_MAKEFLAGS) stamp-h1; else :; fi + +stamp-h1: $(srcdir)/cpuminer-config.h.in $(top_builddir)/config.status + @rm -f stamp-h1 + cd $(top_builddir) && $(SHELL) ./config.status cpuminer-config.h +$(srcdir)/cpuminer-config.h.in: @MAINTAINER_MODE_TRUE@ $(am__configure_deps) + ($(am__cd) $(top_srcdir) && $(AUTOHEADER)) + rm -f stamp-h1 + touch $@ + +distclean-hdr: + -rm -f cpuminer-config.h stamp-h1 +install-binPROGRAMS: $(bin_PROGRAMS) + @$(NORMAL_INSTALL) + @list='$(bin_PROGRAMS)'; test -n "$(bindir)" || list=; \ + if test -n "$$list"; then \ + echo " $(MKDIR_P) '$(DESTDIR)$(bindir)'"; \ + $(MKDIR_P) "$(DESTDIR)$(bindir)" || exit 1; \ + fi; \ + for p in $$list; do echo "$$p $$p"; done | \ + sed 's/$(EXEEXT)$$//' | \ + while read p p1; do if test -f $$p \ + ; then echo "$$p"; echo "$$p"; else :; fi; \ + done | \ + sed -e 'p;s,.*/,,;n;h' \ + -e 's|.*|.|' \ + -e 'p;x;s,.*/,,;s/$(EXEEXT)$$//;$(transform);s/$$/$(EXEEXT)/' | \ + sed 'N;N;N;s,\n, ,g' | \ + $(AWK) 'BEGIN { files["."] = ""; dirs["."] = 1 } \ + { d=$$3; if (dirs[d] != 1) { print "d", d; dirs[d] = 1 } \ + if ($$2 == $$4) files[d] = files[d] " " $$1; \ + else { print "f", $$3 "/" $$4, $$1; } } \ + END { for (d in files) print "f", d, files[d] }' | \ + while read type dir files; do \ + if test "$$dir" = .; then dir=; else dir=/$$dir; fi; \ + test -z "$$files" || { \ + echo " $(INSTALL_PROGRAM_ENV) $(INSTALL_PROGRAM) $$files '$(DESTDIR)$(bindir)$$dir'"; \ + $(INSTALL_PROGRAM_ENV) $(INSTALL_PROGRAM) $$files "$(DESTDIR)$(bindir)$$dir" || exit $$?; \ + } \ + ; done + +uninstall-binPROGRAMS: + @$(NORMAL_UNINSTALL) + @list='$(bin_PROGRAMS)'; test -n "$(bindir)" || list=; \ + files=`for p in $$list; do echo "$$p"; done | \ + sed -e 'h;s,^.*/,,;s/$(EXEEXT)$$//;$(transform)' \ + -e 's/$$/$(EXEEXT)/' \ + `; \ + test -n "$$list" || exit 0; \ + echo " ( cd '$(DESTDIR)$(bindir)' && rm -f" $$files ")"; \ + cd "$(DESTDIR)$(bindir)" && rm -f $$files + +clean-binPROGRAMS: + -test -z "$(bin_PROGRAMS)" || rm -f $(bin_PROGRAMS) + +ccminer$(EXEEXT): $(ccminer_OBJECTS) $(ccminer_DEPENDENCIES) $(EXTRA_ccminer_DEPENDENCIES) + @rm -f ccminer$(EXEEXT) + $(AM_V_CXXLD)$(ccminer_LINK) $(ccminer_OBJECTS) $(ccminer_LDADD) $(LIBS) + +mostlyclean-compile: + -rm -f *.$(OBJEXT) + +distclean-compile: + -rm -f *.tab.c + +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-blake.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-cpu-miner.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-fugue.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-fuguecoin.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-groestl.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-hefty1.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-keccak.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-scrypt.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-sha2.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-util.Po@am__quote@ + +.c.o: +@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c $< + +.c.obj: +@am__fastdepCC_TRUE@ 
$(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c `$(CYGPATH_W) '$<'` + +ccminer-cpu-miner.o: cpu-miner.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-cpu-miner.o -MD -MP -MF $(DEPDIR)/ccminer-cpu-miner.Tpo -c -o ccminer-cpu-miner.o `test -f 'cpu-miner.c' || echo '$(srcdir)/'`cpu-miner.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/ccminer-cpu-miner.Tpo $(DEPDIR)/ccminer-cpu-miner.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='cpu-miner.c' object='ccminer-cpu-miner.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-cpu-miner.o `test -f 'cpu-miner.c' || echo '$(srcdir)/'`cpu-miner.c + +ccminer-cpu-miner.obj: cpu-miner.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-cpu-miner.obj -MD -MP -MF $(DEPDIR)/ccminer-cpu-miner.Tpo -c -o ccminer-cpu-miner.obj `if test -f 'cpu-miner.c'; then $(CYGPATH_W) 'cpu-miner.c'; else $(CYGPATH_W) '$(srcdir)/cpu-miner.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/ccminer-cpu-miner.Tpo $(DEPDIR)/ccminer-cpu-miner.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='cpu-miner.c' object='ccminer-cpu-miner.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-cpu-miner.obj `if test -f 'cpu-miner.c'; then $(CYGPATH_W) 'cpu-miner.c'; else $(CYGPATH_W) '$(srcdir)/cpu-miner.c'; fi` + +ccminer-util.o: util.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-util.o -MD -MP -MF $(DEPDIR)/ccminer-util.Tpo -c -o ccminer-util.o `test -f 'util.c' || echo '$(srcdir)/'`util.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/ccminer-util.Tpo $(DEPDIR)/ccminer-util.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='util.c' object='ccminer-util.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-util.o `test -f 'util.c' || echo '$(srcdir)/'`util.c + +ccminer-util.obj: util.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-util.obj -MD -MP -MF $(DEPDIR)/ccminer-util.Tpo -c -o ccminer-util.obj `if test -f 'util.c'; then $(CYGPATH_W) 'util.c'; else $(CYGPATH_W) '$(srcdir)/util.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/ccminer-util.Tpo $(DEPDIR)/ccminer-util.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ 
$(AM_V_CC)source='util.c' object='ccminer-util.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-util.obj `if test -f 'util.c'; then $(CYGPATH_W) 'util.c'; else $(CYGPATH_W) '$(srcdir)/util.c'; fi` + +ccminer-blake.o: blake.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-blake.o -MD -MP -MF $(DEPDIR)/ccminer-blake.Tpo -c -o ccminer-blake.o `test -f 'blake.c' || echo '$(srcdir)/'`blake.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/ccminer-blake.Tpo $(DEPDIR)/ccminer-blake.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='blake.c' object='ccminer-blake.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-blake.o `test -f 'blake.c' || echo '$(srcdir)/'`blake.c + +ccminer-blake.obj: blake.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-blake.obj -MD -MP -MF $(DEPDIR)/ccminer-blake.Tpo -c -o ccminer-blake.obj `if test -f 'blake.c'; then $(CYGPATH_W) 'blake.c'; else $(CYGPATH_W) '$(srcdir)/blake.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/ccminer-blake.Tpo $(DEPDIR)/ccminer-blake.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='blake.c' object='ccminer-blake.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-blake.obj `if test -f 'blake.c'; then $(CYGPATH_W) 'blake.c'; else $(CYGPATH_W) '$(srcdir)/blake.c'; fi` + +ccminer-groestl.o: groestl.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-groestl.o -MD -MP -MF $(DEPDIR)/ccminer-groestl.Tpo -c -o ccminer-groestl.o `test -f 'groestl.c' || echo '$(srcdir)/'`groestl.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/ccminer-groestl.Tpo $(DEPDIR)/ccminer-groestl.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='groestl.c' object='ccminer-groestl.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-groestl.o `test -f 'groestl.c' || echo '$(srcdir)/'`groestl.c + +ccminer-groestl.obj: groestl.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-groestl.obj -MD -MP -MF $(DEPDIR)/ccminer-groestl.Tpo -c -o ccminer-groestl.obj `if test -f 'groestl.c'; then $(CYGPATH_W) 'groestl.c'; else $(CYGPATH_W) '$(srcdir)/groestl.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/ccminer-groestl.Tpo $(DEPDIR)/ccminer-groestl.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='groestl.c' object='ccminer-groestl.obj' 
libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-groestl.obj `if test -f 'groestl.c'; then $(CYGPATH_W) 'groestl.c'; else $(CYGPATH_W) '$(srcdir)/groestl.c'; fi` + +ccminer-hefty1.o: hefty1.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-hefty1.o -MD -MP -MF $(DEPDIR)/ccminer-hefty1.Tpo -c -o ccminer-hefty1.o `test -f 'hefty1.c' || echo '$(srcdir)/'`hefty1.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/ccminer-hefty1.Tpo $(DEPDIR)/ccminer-hefty1.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='hefty1.c' object='ccminer-hefty1.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-hefty1.o `test -f 'hefty1.c' || echo '$(srcdir)/'`hefty1.c + +ccminer-hefty1.obj: hefty1.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-hefty1.obj -MD -MP -MF $(DEPDIR)/ccminer-hefty1.Tpo -c -o ccminer-hefty1.obj `if test -f 'hefty1.c'; then $(CYGPATH_W) 'hefty1.c'; else $(CYGPATH_W) '$(srcdir)/hefty1.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/ccminer-hefty1.Tpo $(DEPDIR)/ccminer-hefty1.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='hefty1.c' object='ccminer-hefty1.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-hefty1.obj `if test -f 'hefty1.c'; then $(CYGPATH_W) 'hefty1.c'; else $(CYGPATH_W) '$(srcdir)/hefty1.c'; fi` + +ccminer-keccak.o: keccak.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-keccak.o -MD -MP -MF $(DEPDIR)/ccminer-keccak.Tpo -c -o ccminer-keccak.o `test -f 'keccak.c' || echo '$(srcdir)/'`keccak.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/ccminer-keccak.Tpo $(DEPDIR)/ccminer-keccak.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='keccak.c' object='ccminer-keccak.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-keccak.o `test -f 'keccak.c' || echo '$(srcdir)/'`keccak.c + +ccminer-keccak.obj: keccak.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-keccak.obj -MD -MP -MF $(DEPDIR)/ccminer-keccak.Tpo -c -o ccminer-keccak.obj `if test -f 'keccak.c'; then $(CYGPATH_W) 'keccak.c'; else $(CYGPATH_W) '$(srcdir)/keccak.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/ccminer-keccak.Tpo $(DEPDIR)/ccminer-keccak.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='keccak.c' object='ccminer-keccak.obj' libtool=no @AMDEPBACKSLASH@ 
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-keccak.obj `if test -f 'keccak.c'; then $(CYGPATH_W) 'keccak.c'; else $(CYGPATH_W) '$(srcdir)/keccak.c'; fi` + +ccminer-scrypt.o: scrypt.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-scrypt.o -MD -MP -MF $(DEPDIR)/ccminer-scrypt.Tpo -c -o ccminer-scrypt.o `test -f 'scrypt.c' || echo '$(srcdir)/'`scrypt.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/ccminer-scrypt.Tpo $(DEPDIR)/ccminer-scrypt.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='scrypt.c' object='ccminer-scrypt.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-scrypt.o `test -f 'scrypt.c' || echo '$(srcdir)/'`scrypt.c + +ccminer-scrypt.obj: scrypt.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-scrypt.obj -MD -MP -MF $(DEPDIR)/ccminer-scrypt.Tpo -c -o ccminer-scrypt.obj `if test -f 'scrypt.c'; then $(CYGPATH_W) 'scrypt.c'; else $(CYGPATH_W) '$(srcdir)/scrypt.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/ccminer-scrypt.Tpo $(DEPDIR)/ccminer-scrypt.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='scrypt.c' object='ccminer-scrypt.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-scrypt.obj `if test -f 'scrypt.c'; then $(CYGPATH_W) 'scrypt.c'; else $(CYGPATH_W) '$(srcdir)/scrypt.c'; fi` + +ccminer-sha2.o: sha2.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-sha2.o -MD -MP -MF $(DEPDIR)/ccminer-sha2.Tpo -c -o ccminer-sha2.o `test -f 'sha2.c' || echo '$(srcdir)/'`sha2.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/ccminer-sha2.Tpo $(DEPDIR)/ccminer-sha2.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='sha2.c' object='ccminer-sha2.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-sha2.o `test -f 'sha2.c' || echo '$(srcdir)/'`sha2.c + +ccminer-sha2.obj: sha2.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-sha2.obj -MD -MP -MF $(DEPDIR)/ccminer-sha2.Tpo -c -o ccminer-sha2.obj `if test -f 'sha2.c'; then $(CYGPATH_W) 'sha2.c'; else $(CYGPATH_W) '$(srcdir)/sha2.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/ccminer-sha2.Tpo $(DEPDIR)/ccminer-sha2.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='sha2.c' object='ccminer-sha2.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ 
+@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-sha2.obj `if test -f 'sha2.c'; then $(CYGPATH_W) 'sha2.c'; else $(CYGPATH_W) '$(srcdir)/sha2.c'; fi` + +ccminer-fugue.o: fugue.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-fugue.o -MD -MP -MF $(DEPDIR)/ccminer-fugue.Tpo -c -o ccminer-fugue.o `test -f 'fugue.c' || echo '$(srcdir)/'`fugue.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/ccminer-fugue.Tpo $(DEPDIR)/ccminer-fugue.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='fugue.c' object='ccminer-fugue.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-fugue.o `test -f 'fugue.c' || echo '$(srcdir)/'`fugue.c + +ccminer-fugue.obj: fugue.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-fugue.obj -MD -MP -MF $(DEPDIR)/ccminer-fugue.Tpo -c -o ccminer-fugue.obj `if test -f 'fugue.c'; then $(CYGPATH_W) 'fugue.c'; else $(CYGPATH_W) '$(srcdir)/fugue.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/ccminer-fugue.Tpo $(DEPDIR)/ccminer-fugue.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='fugue.c' object='ccminer-fugue.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-fugue.obj `if test -f 'fugue.c'; then $(CYGPATH_W) 'fugue.c'; else $(CYGPATH_W) '$(srcdir)/fugue.c'; fi` + +.cpp.o: +@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< +@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXXCOMPILE) -c -o $@ $< + +.cpp.obj: +@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'` +@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXXCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'` + +ccminer-fuguecoin.o: fuguecoin.cpp +@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT ccminer-fuguecoin.o -MD -MP -MF $(DEPDIR)/ccminer-fuguecoin.Tpo -c -o ccminer-fuguecoin.o `test -f 'fuguecoin.cpp' || echo '$(srcdir)/'`fuguecoin.cpp +@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/ccminer-fuguecoin.Tpo $(DEPDIR)/ccminer-fuguecoin.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='fuguecoin.cpp' object='ccminer-fuguecoin.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) 
$(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o ccminer-fuguecoin.o `test -f 'fuguecoin.cpp' || echo '$(srcdir)/'`fuguecoin.cpp + +ccminer-fuguecoin.obj: fuguecoin.cpp +@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT ccminer-fuguecoin.obj -MD -MP -MF $(DEPDIR)/ccminer-fuguecoin.Tpo -c -o ccminer-fuguecoin.obj `if test -f 'fuguecoin.cpp'; then $(CYGPATH_W) 'fuguecoin.cpp'; else $(CYGPATH_W) '$(srcdir)/fuguecoin.cpp'; fi` +@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/ccminer-fuguecoin.Tpo $(DEPDIR)/ccminer-fuguecoin.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='fuguecoin.cpp' object='ccminer-fuguecoin.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o ccminer-fuguecoin.obj `if test -f 'fuguecoin.cpp'; then $(CYGPATH_W) 'fuguecoin.cpp'; else $(CYGPATH_W) '$(srcdir)/fuguecoin.cpp'; fi` + +# This directory's subdirectories are mostly independent; you can cd +# into them and run 'make' without going through this Makefile. +# To change the values of 'make' variables: instead of editing Makefiles, +# (1) if the variable is set in 'config.status', edit 'config.status' +# (which will cause the Makefiles to be regenerated when you run 'make'); +# (2) otherwise, pass the desired values on the 'make' command line. +$(am__recursive_targets): + @fail=; \ + if $(am__make_keepgoing); then \ + failcom='fail=yes'; \ + else \ + failcom='exit 1'; \ + fi; \ + dot_seen=no; \ + target=`echo $@ | sed s/-recursive//`; \ + case "$@" in \ + distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \ + *) list='$(SUBDIRS)' ;; \ + esac; \ + for subdir in $$list; do \ + echo "Making $$target in $$subdir"; \ + if test "$$subdir" = "."; then \ + dot_seen=yes; \ + local_target="$$target-am"; \ + else \ + local_target="$$target"; \ + fi; \ + ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \ + || eval $$failcom; \ + done; \ + if test "$$dot_seen" = "no"; then \ + $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \ + fi; test -z "$$fail" + +ID: $(am__tagged_files) + $(am__define_uniq_tagged_files); mkid -fID $$unique +tags: tags-recursive +TAGS: tags + +tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + set x; \ + here=`pwd`; \ + if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \ + include_option=--etags-include; \ + empty_fix=.; \ + else \ + include_option=--include; \ + empty_fix=; \ + fi; \ + list='$(SUBDIRS)'; for subdir in $$list; do \ + if test "$$subdir" = .; then :; else \ + test ! 
-f $$subdir/TAGS || \ + set "$$@" "$$include_option=$$here/$$subdir/TAGS"; \ + fi; \ + done; \ + $(am__define_uniq_tagged_files); \ + shift; \ + if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ + test -n "$$unique" || unique=$$empty_fix; \ + if test $$# -gt 0; then \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + "$$@" $$unique; \ + else \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + $$unique; \ + fi; \ + fi +ctags: ctags-recursive + +CTAGS: ctags +ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + $(am__define_uniq_tagged_files); \ + test -z "$(CTAGS_ARGS)$$unique" \ + || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ + $$unique + +GTAGS: + here=`$(am__cd) $(top_builddir) && pwd` \ + && $(am__cd) $(top_srcdir) \ + && gtags -i $(GTAGS_ARGS) "$$here" +cscope: cscope.files + test ! -s cscope.files \ + || $(CSCOPE) -b -q $(AM_CSCOPEFLAGS) $(CSCOPEFLAGS) -i cscope.files $(CSCOPE_ARGS) +clean-cscope: + -rm -f cscope.files +cscope.files: clean-cscope cscopelist +cscopelist: cscopelist-recursive + +cscopelist-am: $(am__tagged_files) + list='$(am__tagged_files)'; \ + case "$(srcdir)" in \ + [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \ + *) sdir=$(subdir)/$(srcdir) ;; \ + esac; \ + for i in $$list; do \ + if test -f "$$i"; then \ + echo "$(subdir)/$$i"; \ + else \ + echo "$$sdir/$$i"; \ + fi; \ + done >> $(top_builddir)/cscope.files + +distclean-tags: + -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags + -rm -f cscope.out cscope.in.out cscope.po.out cscope.files + +distdir: $(DISTFILES) + $(am__remove_distdir) + test -d "$(distdir)" || mkdir "$(distdir)" + @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + list='$(DISTFILES)'; \ + dist_files=`for file in $$list; do echo $$file; done | \ + sed -e "s|^$$srcdirstrip/||;t" \ + -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ + case $$dist_files in \ + */*) $(MKDIR_P) `echo "$$dist_files" | \ + sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ + sort -u` ;; \ + esac; \ + for file in $$dist_files; do \ + if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ + if test -d $$d/$$file; then \ + dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ + if test -d "$(distdir)/$$file"; then \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ + cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ + find "$(distdir)/$$file" -type d ! 
-perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ + else \ + test -f "$(distdir)/$$file" \ + || cp -p $$d/$$file "$(distdir)/$$file" \ + || exit 1; \ + fi; \ + done + @list='$(DIST_SUBDIRS)'; for subdir in $$list; do \ + if test "$$subdir" = .; then :; else \ + $(am__make_dryrun) \ + || test -d "$(distdir)/$$subdir" \ + || $(MKDIR_P) "$(distdir)/$$subdir" \ + || exit 1; \ + dir1=$$subdir; dir2="$(distdir)/$$subdir"; \ + $(am__relativize); \ + new_distdir=$$reldir; \ + dir1=$$subdir; dir2="$(top_distdir)"; \ + $(am__relativize); \ + new_top_distdir=$$reldir; \ + echo " (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir="$$new_top_distdir" distdir="$$new_distdir" \\"; \ + echo " am__remove_distdir=: am__skip_length_check=: am__skip_mode_fix=: distdir)"; \ + ($(am__cd) $$subdir && \ + $(MAKE) $(AM_MAKEFLAGS) \ + top_distdir="$$new_top_distdir" \ + distdir="$$new_distdir" \ + am__remove_distdir=: \ + am__skip_length_check=: \ + am__skip_mode_fix=: \ + distdir) \ + || exit 1; \ + fi; \ + done + -test -n "$(am__skip_mode_fix)" \ + || find "$(distdir)" -type d ! -perm -755 \ + -exec chmod u+rwx,go+rx {} \; -o \ + ! -type d ! -perm -444 -links 1 -exec chmod a+r {} \; -o \ + ! -type d ! -perm -400 -exec chmod a+r {} \; -o \ + ! -type d ! -perm -444 -exec $(install_sh) -c -m a+r {} {} \; \ + || chmod -R a+r "$(distdir)" +dist-gzip: distdir + tardir=$(distdir) && $(am__tar) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).tar.gz + $(am__post_remove_distdir) + +dist-bzip2: distdir + tardir=$(distdir) && $(am__tar) | BZIP2=$${BZIP2--9} bzip2 -c >$(distdir).tar.bz2 + $(am__post_remove_distdir) + +dist-lzip: distdir + tardir=$(distdir) && $(am__tar) | lzip -c $${LZIP_OPT--9} >$(distdir).tar.lz + $(am__post_remove_distdir) + +dist-xz: distdir + tardir=$(distdir) && $(am__tar) | XZ_OPT=$${XZ_OPT--e} xz -c >$(distdir).tar.xz + $(am__post_remove_distdir) + +dist-tarZ: distdir + tardir=$(distdir) && $(am__tar) | compress -c >$(distdir).tar.Z + $(am__post_remove_distdir) + +dist-shar: distdir + shar $(distdir) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).shar.gz + $(am__post_remove_distdir) + +dist-zip: distdir + -rm -f $(distdir).zip + zip -rq $(distdir).zip $(distdir) + $(am__post_remove_distdir) + +dist dist-all: + $(MAKE) $(AM_MAKEFLAGS) $(DIST_TARGETS) am__post_remove_distdir='@:' + $(am__post_remove_distdir) + +# This target untars the dist file and tries a VPATH configuration. Then +# it guarantees that the distribution is self-contained by making another +# tarfile. +distcheck: dist + case '$(DIST_ARCHIVES)' in \ + *.tar.gz*) \ + GZIP=$(GZIP_ENV) gzip -dc $(distdir).tar.gz | $(am__untar) ;;\ + *.tar.bz2*) \ + bzip2 -dc $(distdir).tar.bz2 | $(am__untar) ;;\ + *.tar.lz*) \ + lzip -dc $(distdir).tar.lz | $(am__untar) ;;\ + *.tar.xz*) \ + xz -dc $(distdir).tar.xz | $(am__untar) ;;\ + *.tar.Z*) \ + uncompress -c $(distdir).tar.Z | $(am__untar) ;;\ + *.shar.gz*) \ + GZIP=$(GZIP_ENV) gzip -dc $(distdir).shar.gz | unshar ;;\ + *.zip*) \ + unzip $(distdir).zip ;;\ + esac + chmod -R a-w $(distdir) + chmod u+w $(distdir) + mkdir $(distdir)/_build $(distdir)/_inst + chmod a-w $(distdir) + test -d $(distdir)/_build || exit 0; \ + dc_install_base=`$(am__cd) $(distdir)/_inst && pwd | sed -e 's,^[^:\\/]:[\\/],/,'` \ + && dc_destdir="$${TMPDIR-/tmp}/am-dc-$$$$/" \ + && am__cwd=`pwd` \ + && $(am__cd) $(distdir)/_build \ + && ../configure --srcdir=.. 
--prefix="$$dc_install_base" \ + $(AM_DISTCHECK_CONFIGURE_FLAGS) \ + $(DISTCHECK_CONFIGURE_FLAGS) \ + && $(MAKE) $(AM_MAKEFLAGS) \ + && $(MAKE) $(AM_MAKEFLAGS) dvi \ + && $(MAKE) $(AM_MAKEFLAGS) check \ + && $(MAKE) $(AM_MAKEFLAGS) install \ + && $(MAKE) $(AM_MAKEFLAGS) installcheck \ + && $(MAKE) $(AM_MAKEFLAGS) uninstall \ + && $(MAKE) $(AM_MAKEFLAGS) distuninstallcheck_dir="$$dc_install_base" \ + distuninstallcheck \ + && chmod -R a-w "$$dc_install_base" \ + && ({ \ + (cd ../.. && umask 077 && mkdir "$$dc_destdir") \ + && $(MAKE) $(AM_MAKEFLAGS) DESTDIR="$$dc_destdir" install \ + && $(MAKE) $(AM_MAKEFLAGS) DESTDIR="$$dc_destdir" uninstall \ + && $(MAKE) $(AM_MAKEFLAGS) DESTDIR="$$dc_destdir" \ + distuninstallcheck_dir="$$dc_destdir" distuninstallcheck; \ + } || { rm -rf "$$dc_destdir"; exit 1; }) \ + && rm -rf "$$dc_destdir" \ + && $(MAKE) $(AM_MAKEFLAGS) dist \ + && rm -rf $(DIST_ARCHIVES) \ + && $(MAKE) $(AM_MAKEFLAGS) distcleancheck \ + && cd "$$am__cwd" \ + || exit 1 + $(am__post_remove_distdir) + @(echo "$(distdir) archives ready for distribution: "; \ + list='$(DIST_ARCHIVES)'; for i in $$list; do echo $$i; done) | \ + sed -e 1h -e 1s/./=/g -e 1p -e 1x -e '$$p' -e '$$x' +distuninstallcheck: + @test -n '$(distuninstallcheck_dir)' || { \ + echo 'ERROR: trying to run $@ with an empty' \ + '$$(distuninstallcheck_dir)' >&2; \ + exit 1; \ + }; \ + $(am__cd) '$(distuninstallcheck_dir)' || { \ + echo 'ERROR: cannot chdir into $(distuninstallcheck_dir)' >&2; \ + exit 1; \ + }; \ + test `$(am__distuninstallcheck_listfiles) | wc -l` -eq 0 \ + || { echo "ERROR: files left after uninstall:" ; \ + if test -n "$(DESTDIR)"; then \ + echo " (check DESTDIR support)"; \ + fi ; \ + $(distuninstallcheck_listfiles) ; \ + exit 1; } >&2 +distcleancheck: distclean + @if test '$(srcdir)' = . ; then \ + echo "ERROR: distcleancheck can only run from a VPATH build" ; \ + exit 1 ; \ + fi + @test `$(distcleancheck_listfiles) | wc -l` -eq 0 \ + || { echo "ERROR: files left in build directory after distclean:" ; \ + $(distcleancheck_listfiles) ; \ + exit 1; } >&2 +check-am: all-am +check: check-recursive +all-am: Makefile $(PROGRAMS) cpuminer-config.h +installdirs: installdirs-recursive +installdirs-am: + for dir in "$(DESTDIR)$(bindir)"; do \ + test -z "$$dir" || $(MKDIR_P) "$$dir"; \ + done +install: install-recursive +install-exec: install-exec-recursive +install-data: install-data-recursive +uninstall: uninstall-recursive + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-recursive +install-strip: + if test -z '$(STRIP)'; then \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + install; \ + else \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ + fi +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) + -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) + +maintainer-clean-generic: + @echo "This command is intended for maintainers to use" + @echo "it deletes files that may require special tools to rebuild." 
+clean: clean-recursive + +clean-am: clean-binPROGRAMS clean-generic mostlyclean-am + +distclean: distclean-recursive + -rm -f $(am__CONFIG_DISTCLEAN_FILES) + -rm -rf ./$(DEPDIR) + -rm -f Makefile +distclean-am: clean-am distclean-compile distclean-generic \ + distclean-hdr distclean-tags + +dvi: dvi-recursive + +dvi-am: + +html: html-recursive + +html-am: + +info: info-recursive + +info-am: + +install-data-am: + +install-dvi: install-dvi-recursive + +install-dvi-am: + +install-exec-am: install-binPROGRAMS + +install-html: install-html-recursive + +install-html-am: + +install-info: install-info-recursive + +install-info-am: + +install-man: + +install-pdf: install-pdf-recursive + +install-pdf-am: + +install-ps: install-ps-recursive + +install-ps-am: + +installcheck-am: + +maintainer-clean: maintainer-clean-recursive + -rm -f $(am__CONFIG_DISTCLEAN_FILES) + -rm -rf $(top_srcdir)/autom4te.cache + -rm -rf ./$(DEPDIR) + -rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: mostlyclean-recursive + +mostlyclean-am: mostlyclean-compile mostlyclean-generic + +pdf: pdf-recursive + +pdf-am: + +ps: ps-recursive + +ps-am: + +uninstall-am: uninstall-binPROGRAMS + +.MAKE: $(am__recursive_targets) all install-am install-strip + +.PHONY: $(am__recursive_targets) CTAGS GTAGS TAGS all all-am \ + am--refresh check check-am clean clean-binPROGRAMS \ + clean-cscope clean-generic cscope cscopelist-am ctags ctags-am \ + dist dist-all dist-bzip2 dist-gzip dist-lzip dist-shar \ + dist-tarZ dist-xz dist-zip distcheck distclean \ + distclean-compile distclean-generic distclean-hdr \ + distclean-tags distcleancheck distdir distuninstallcheck dvi \ + dvi-am html html-am info info-am install install-am \ + install-binPROGRAMS install-data install-data-am install-dvi \ + install-dvi-am install-exec install-exec-am install-html \ + install-html-am install-info install-info-am install-man \ + install-pdf install-pdf-am install-ps install-ps-am \ + install-strip installcheck installcheck-am installdirs \ + installdirs-am maintainer-clean maintainer-clean-generic \ + mostlyclean mostlyclean-compile mostlyclean-generic pdf pdf-am \ + ps ps-am tags tags-am uninstall uninstall-am \ + uninstall-binPROGRAMS + + +.cu.o: + $(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -arch=compute_20 --maxrregcount=63 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< + +#heavy.o: heavy.cu +# $(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -arch=compute_20 --maxrregcount=63 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< +# +#cuda_hefty1.o: cuda_hefty1.cu +# $(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -arch=compute_20 --maxrregcount=63 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/NEWS b/NEWS new file mode 100644 index 0000000..d3f5a12 --- /dev/null +++ b/NEWS @@ -0,0 +1 @@ + diff --git a/README b/README new file mode 100644 index 0000000..f5c26fc --- /dev/null +++ b/README @@ -0,0 +1,3 @@ + +A CUDA based miner for Heavycoin and Fuguecoin. 
For more information, take a look at README.txt + diff --git a/README.txt b/README.txt new file mode 100644 index 0000000..1298197 --- /dev/null +++ b/README.txt @@ -0,0 +1,117 @@ + +HeavyCUDA release Mar 18th 2014 - Initial Release +------------------------------------------------------------- + +*************************************************************** +If you find this tool useful and like to support its continued + development, then consider a donation. + + LTC donation address: LKS1WDKGED647msBQfLBHV3Ls8sveGncnm + BTC donation address: 16hJF5mceSojnTD3ZTUDqdRhDyPJzoRakM + YAC donation address: Y87sptDEcpLkLeAuex6qZioDbvy1qXZEj4 + VTC donation address: VrjeFzMgvteCGarLw85KivBzmsiH9fqp4a + MAX donation address: mHrhQP9EFArechWxTFJ97s9D3jvcCvEEnt + DOGE donation address: DT9ghsGmez6ojVdEZgvaZbT2Z3TruXG6yP + PANDA donation address: PvgtxJ2ZKaudRogCXfUMLXVaWUMcKQgRed + MRC donation address: 1Lxc4JPDpQRJB8BN4YwhmSQ3Rcu8gjj2Kd + HVC donation address: HNN3PyyTMkDo4RkEjkWSGMwqia1yD8mwJN +*************************************************************** + +>>> Introduction <<< + +This is a CUDA-accelerated mining application for use with +HeavyCoin and FugueCoin. We did not put much effort into usability, +so please set your parameters carefully. + +THIS PROGRAM IS PROVIDED "AS-IS", USE IT AT YOUR OWN RISK! + +If you're interested and read the source code, please excuse +that most of our comments are in German. + +>>> Command Line Interface <<< + +This code is based on the pooler cpuminer 2.3.2 release and inherits +its command line interface and options. + + -a, --algo=ALGO specify the algorithm to use + heavy use to mine Heavycoin + fugue256 use to mine Fuguecoin + + -o, --url=URL URL of mining server (default: " DEF_RPC_URL ") + -O, --userpass=U:P username:password pair for mining server + -u, --user=USERNAME username for mining server + -p, --pass=PASSWORD password for mining server + -v, --vote Heavycoin block vote (default: 512) + --cert=FILE certificate for mining server using SSL + -x, --proxy=[PROTOCOL://]HOST[:PORT] connect through a proxy + -t, --threads=N number of miner threads (default: number of nVidia GPUs in your system) + -r, --retries=N number of times to retry if a network call fails + (default: retry indefinitely) + -R, --retry-pause=N time to pause between retries, in seconds (default: 15) + -T, --timeout=N network timeout, in seconds (default: 270) + -s, --scantime=N upper bound on time spent scanning current work when + long polling is unavailable, in seconds (default: 5) + --no-longpoll disable X-Long-Polling support + --no-stratum disable X-Stratum support + -q, --quiet disable per-thread hashmeter output + -D, --debug enable debug output + -P, --protocol-dump verbose dump of protocol-level activities + -B, --background run the miner in the background + --benchmark run in offline benchmark mode + -c, --config=FILE load a JSON-format configuration file + -V, --version display version information and exit + -h, --help display this help text and exit + +>>> Examples <<< + +Example for Heavycoin Mining on heavycoinpool.com with a single GPU in your system + +cudaminer.exe -t 1 -a heavy -o stratum+tcp://stratum01.heavycoinpool.com:5333 -u <> -p <> -v 512 + + + +Example for Heavycoin Mining on hvc.1gh.com with two GPUs in your system + +cudaminer.exe -t 2 -a heavy -o stratum+tcp://hvcpool.1gh.com:5333 -u <> -p x -v 512 + + + +Example for Fuguecoin solo-mining with 4 GPUs in your system and a Fuguecoin wallet running on localhost + +cudaminer.exe -q -s 1 -t 4 -a
fugue256 -o http://localhost:9089 -u <> -p <> + +For solo-mining, you typically use -o 127.0.0.1:xxxx where xxxx represents +the RPC port number specified in your wallet's .conf file and you have to +pass the same username and password with -O as specified in the wallet's +.conf file. The wallet must also be started with the -server option and +the server flag in the wallet's .conf file set to 1. + + +>>> Additional Notes <<< + +This code should run on nVidia GPUs ranging from compute capability +2.0 up to compute capability 3.5. Just don't expect any hashing miracles +from your old clunkers. + +>>> RELEASE HISTORY <<< + + March 18, 2014 initial release. + + +>>> AUTHORS <<< + +Notable contributors to this application are: + +Christian Buchner, Christian H. (Germany): CUDA implementation + +and also many thanks to anyone else who contributed to the original +cpuminer application (Jeff Garzik, pooler), its original HVC-fork +and the HVC-fork available at hvc.1gh.com + +Source code is included to satisfy GNU GPL V2 requirements. + + +With kind regards, + + Christian Buchner ( Christian.Buchner@gmail.com ) + Christian H. ( Chris84 ) diff --git a/aclocal.m4 b/aclocal.m4 new file mode 100644 index 0000000..1b2c558 --- /dev/null +++ b/aclocal.m4 @@ -0,0 +1,1379 @@ +# generated automatically by aclocal 1.13.3 -*- Autoconf -*- + +# Copyright (C) 1996-2013 Free Software Foundation, Inc. + +# This file is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + +m4_ifndef([AC_CONFIG_MACRO_DIRS], [m4_defun([_AM_CONFIG_MACRO_DIRS], [])m4_defun([AC_CONFIG_MACRO_DIRS], [_AM_CONFIG_MACRO_DIRS($@)])]) +m4_ifndef([AC_AUTOCONF_VERSION], + [m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl +m4_if(m4_defn([AC_AUTOCONF_VERSION]), [2.69],, +[m4_warning([this file was generated for autoconf 2.69. +You have another version of autoconf. It may work, but is not guaranteed to. +If you have problems, you may need to regenerate the build system entirely. +To do so, use the procedure documented by the package, typically 'autoreconf'.])]) + +# LIBCURL_CHECK_CONFIG ([DEFAULT-ACTION], [MINIMUM-VERSION], +# [ACTION-IF-YES], [ACTION-IF-NO]) +# ---------------------------------------------------------- +# David Shaw May-09-2006 +# +# Checks for libcurl. DEFAULT-ACTION is the string yes or no to +# specify whether to default to --with-libcurl or --without-libcurl. +# If not supplied, DEFAULT-ACTION is yes. MINIMUM-VERSION is the +# minimum version of libcurl to accept. Pass the version as a regular +# version number like 7.10.1. If not supplied, any version is +# accepted. ACTION-IF-YES is a list of shell commands to run if +# libcurl was successfully found and passed the various tests. +# ACTION-IF-NO is a list of shell commands that are run otherwise. +# Note that using --without-libcurl does run ACTION-IF-NO. +# +# This macro #defines HAVE_LIBCURL if a working libcurl setup is +# found, and sets @LIBCURL@ and @LIBCURL_CPPFLAGS@ to the necessary +# values. Other useful defines are LIBCURL_FEATURE_xxx where xxx are +# the various features supported by libcurl, and LIBCURL_PROTOCOL_yyy +# where yyy are the various protocols supported by libcurl.
Both xxx +# and yyy are capitalized. See the list of AH_TEMPLATEs at the top of +# the macro for the complete list of possible defines. Shell +# variables $libcurl_feature_xxx and $libcurl_protocol_yyy are also +# defined to 'yes' for those features and protocols that were found. +# Note that xxx and yyy keep the same capitalization as in the +# curl-config list (e.g. it's "HTTP" and not "http"). +# +# Users may override the detected values by doing something like: +# LIBCURL="-lcurl" LIBCURL_CPPFLAGS="-I/usr/myinclude" ./configure +# +# For the sake of sanity, this macro assumes that any libcurl that is +# found is after version 7.7.2, the first version that included the +# curl-config script. Note that it is very important for people +# packaging binary versions of libcurl to include this script! +# Without curl-config, we can only guess what protocols are available, +# or use curl_version_info to figure it out at runtime. + +AC_DEFUN([LIBCURL_CHECK_CONFIG], +[ + AH_TEMPLATE([LIBCURL_FEATURE_SSL],[Defined if libcurl supports SSL]) + AH_TEMPLATE([LIBCURL_FEATURE_KRB4],[Defined if libcurl supports KRB4]) + AH_TEMPLATE([LIBCURL_FEATURE_IPV6],[Defined if libcurl supports IPv6]) + AH_TEMPLATE([LIBCURL_FEATURE_LIBZ],[Defined if libcurl supports libz]) + AH_TEMPLATE([LIBCURL_FEATURE_ASYNCHDNS],[Defined if libcurl supports AsynchDNS]) + AH_TEMPLATE([LIBCURL_FEATURE_IDN],[Defined if libcurl supports IDN]) + AH_TEMPLATE([LIBCURL_FEATURE_SSPI],[Defined if libcurl supports SSPI]) + AH_TEMPLATE([LIBCURL_FEATURE_NTLM],[Defined if libcurl supports NTLM]) + + AH_TEMPLATE([LIBCURL_PROTOCOL_HTTP],[Defined if libcurl supports HTTP]) + AH_TEMPLATE([LIBCURL_PROTOCOL_HTTPS],[Defined if libcurl supports HTTPS]) + AH_TEMPLATE([LIBCURL_PROTOCOL_FTP],[Defined if libcurl supports FTP]) + AH_TEMPLATE([LIBCURL_PROTOCOL_FTPS],[Defined if libcurl supports FTPS]) + AH_TEMPLATE([LIBCURL_PROTOCOL_FILE],[Defined if libcurl supports FILE]) + AH_TEMPLATE([LIBCURL_PROTOCOL_TELNET],[Defined if libcurl supports TELNET]) + AH_TEMPLATE([LIBCURL_PROTOCOL_LDAP],[Defined if libcurl supports LDAP]) + AH_TEMPLATE([LIBCURL_PROTOCOL_DICT],[Defined if libcurl supports DICT]) + AH_TEMPLATE([LIBCURL_PROTOCOL_TFTP],[Defined if libcurl supports TFTP]) + AH_TEMPLATE([LIBCURL_PROTOCOL_RTSP],[Defined if libcurl supports RTSP]) + AH_TEMPLATE([LIBCURL_PROTOCOL_POP3],[Defined if libcurl supports POP3]) + AH_TEMPLATE([LIBCURL_PROTOCOL_IMAP],[Defined if libcurl supports IMAP]) + AH_TEMPLATE([LIBCURL_PROTOCOL_SMTP],[Defined if libcurl supports SMTP]) + + AC_ARG_WITH(libcurl, + AC_HELP_STRING([--with-libcurl=PREFIX],[look for the curl library in PREFIX/lib and headers in PREFIX/include]), + [_libcurl_with=$withval],[_libcurl_with=ifelse([$1],,[yes],[$1])]) + + if test "$_libcurl_with" != "no" ; then + + AC_PROG_AWK + + _libcurl_version_parse="eval $AWK '{split(\$NF,A,\".\"); X=256*256*A[[1]]+256*A[[2]]+A[[3]]; print X;}'" + + _libcurl_try_link=yes + + if test -d "$_libcurl_with" ; then + LIBCURL_CPPFLAGS="-I$withval/include" + _libcurl_ldflags="-L$withval/lib" + AC_PATH_PROG([_libcurl_config],[curl-config],[], + ["$withval/bin"]) + else + AC_PATH_PROG([_libcurl_config],[curl-config],[],[$PATH]) + fi + + if test x$_libcurl_config != "x" ; then + AC_CACHE_CHECK([for the version of libcurl], + [libcurl_cv_lib_curl_version], + [libcurl_cv_lib_curl_version=`$_libcurl_config --version | $AWK '{print $[]2}'`]) + + _libcurl_version=`echo $libcurl_cv_lib_curl_version | $_libcurl_version_parse` + _libcurl_wanted=`echo ifelse([$2],,[0],[$2]) | 
$_libcurl_version_parse` + + if test $_libcurl_wanted -gt 0 ; then + AC_CACHE_CHECK([for libcurl >= version $2], + [libcurl_cv_lib_version_ok], + [ + if test $_libcurl_version -ge $_libcurl_wanted ; then + libcurl_cv_lib_version_ok=yes + else + libcurl_cv_lib_version_ok=no + fi + ]) + fi + + if test $_libcurl_wanted -eq 0 || test x$libcurl_cv_lib_version_ok = xyes ; then + if test x"$LIBCURL_CPPFLAGS" = "x" ; then + LIBCURL_CPPFLAGS=`$_libcurl_config --cflags` + fi + if test x"$LIBCURL" = "x" ; then + LIBCURL=`$_libcurl_config --libs` + + # This is so silly, but Apple actually has a bug in their + # curl-config script. Fixed in Tiger, but there are still + # lots of Panther installs around. + case "${host}" in + powerpc-apple-darwin7*) + LIBCURL=`echo $LIBCURL | sed -e 's|-arch i386||g'` + ;; + esac + fi + + # All curl-config scripts support --feature + _libcurl_features=`$_libcurl_config --feature` + + # Is it modern enough to have --protocols? (7.12.4) + if test $_libcurl_version -ge 461828 ; then + _libcurl_protocols=`$_libcurl_config --protocols` + fi + else + _libcurl_try_link=no + fi + + unset _libcurl_wanted + fi + + if test $_libcurl_try_link = yes ; then + + # we didn't find curl-config, so let's see if the user-supplied + # link line (or failing that, "-lcurl") is enough. + LIBCURL=${LIBCURL-"$_libcurl_ldflags -lcurl"} + + AC_CACHE_CHECK([whether libcurl is usable], + [libcurl_cv_lib_curl_usable], + [ + _libcurl_save_cppflags=$CPPFLAGS + CPPFLAGS="$LIBCURL_CPPFLAGS $CPPFLAGS" + _libcurl_save_libs=$LIBS + LIBS="$LIBCURL $LIBS" + + AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include <curl/curl.h>]],[[ +/* Try and use a few common options to force a failure if we are + missing symbols or can't link. */ +int x; +curl_easy_setopt(NULL,CURLOPT_URL,NULL); +x=CURL_ERROR_SIZE; +x=CURLOPT_WRITEFUNCTION; +x=CURLOPT_FILE; +x=CURLOPT_ERRORBUFFER; +x=CURLOPT_STDERR; +x=CURLOPT_VERBOSE; +if (x) ; +]])],libcurl_cv_lib_curl_usable=yes,libcurl_cv_lib_curl_usable=no) + + CPPFLAGS=$_libcurl_save_cppflags + LIBS=$_libcurl_save_libs + unset _libcurl_save_cppflags + unset _libcurl_save_libs + ]) + + if test $libcurl_cv_lib_curl_usable = yes ; then + + # Does curl_free() exist in this version of libcurl?
+ # If not, fake it with free() + + _libcurl_save_cppflags=$CPPFLAGS + CPPFLAGS="$CPPFLAGS $LIBCURL_CPPFLAGS" + _libcurl_save_libs=$LIBS + LIBS="$LIBS $LIBCURL" + + AC_CHECK_FUNC(curl_free,, + AC_DEFINE(curl_free,free, + [Define curl_free() as free() if our version of curl lacks curl_free.])) + + CPPFLAGS=$_libcurl_save_cppflags + LIBS=$_libcurl_save_libs + unset _libcurl_save_cppflags + unset _libcurl_save_libs + + AC_DEFINE(HAVE_LIBCURL,1, + [Define to 1 if you have a functional curl library.]) + AC_SUBST(LIBCURL_CPPFLAGS) + AC_SUBST(LIBCURL) + + for _libcurl_feature in $_libcurl_features ; do + AC_DEFINE_UNQUOTED(AS_TR_CPP(libcurl_feature_$_libcurl_feature),[1]) + eval AS_TR_SH(libcurl_feature_$_libcurl_feature)=yes + done + + if test "x$_libcurl_protocols" = "x" ; then + + # We don't have --protocols, so just assume that all + # protocols are available + _libcurl_protocols="HTTP FTP FILE TELNET LDAP DICT TFTP" + + if test x$libcurl_feature_SSL = xyes ; then + _libcurl_protocols="$_libcurl_protocols HTTPS" + + # FTPS wasn't standards-compliant until version + # 7.11.0 (0x070b00 == 461568) + if test $_libcurl_version -ge 461568; then + _libcurl_protocols="$_libcurl_protocols FTPS" + fi + fi + + # RTSP, IMAP, POP3 and SMTP were added in + # 7.20.0 (0x071400 == 463872) + if test $_libcurl_version -ge 463872; then + _libcurl_protocols="$_libcurl_protocols RTSP IMAP POP3 SMTP" + fi + fi + + for _libcurl_protocol in $_libcurl_protocols ; do + AC_DEFINE_UNQUOTED(AS_TR_CPP(libcurl_protocol_$_libcurl_protocol),[1]) + eval AS_TR_SH(libcurl_protocol_$_libcurl_protocol)=yes + done + else + unset LIBCURL + unset LIBCURL_CPPFLAGS + fi + fi + + unset _libcurl_try_link + unset _libcurl_version_parse + unset _libcurl_config + unset _libcurl_feature + unset _libcurl_features + unset _libcurl_protocol + unset _libcurl_protocols + unset _libcurl_version + unset _libcurl_ldflags + fi + + if test x$_libcurl_with = xno || test x$libcurl_cv_lib_curl_usable != xyes ; then + # This is the IF-NO path + ifelse([$4],,:,[$4]) + else + # This is the IF-YES path + ifelse([$3],,:,[$3]) + fi + + unset _libcurl_with +])dnl + +# Copyright (C) 2002-2013 Free Software Foundation, Inc. +# +# This file is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# AM_AUTOMAKE_VERSION(VERSION) +# ---------------------------- +# Automake X.Y traces this macro to ensure aclocal.m4 has been +# generated from the m4 files accompanying Automake X.Y. +# (This private macro should not be called outside this file.) +AC_DEFUN([AM_AUTOMAKE_VERSION], +[am__api_version='1.13' +dnl Some users find AM_AUTOMAKE_VERSION and mistake it for a way to +dnl require some minimum version. Point them to the right macro. +m4_if([$1], [1.13.3], [], + [AC_FATAL([Do not call $0, use AM_INIT_AUTOMAKE([$1]).])])dnl +]) + +# _AM_AUTOCONF_VERSION(VERSION) +# ----------------------------- +# aclocal traces this macro to find the Autoconf version. +# This is a private macro too. Using m4_define simplifies +# the logic in aclocal, which can simply ignore this definition. +m4_define([_AM_AUTOCONF_VERSION], []) + +# AM_SET_CURRENT_AUTOMAKE_VERSION +# ------------------------------- +# Call AM_AUTOMAKE_VERSION and AM_AUTOMAKE_VERSION so they can be traced. +# This function is AC_REQUIREd by AM_INIT_AUTOMAKE. 
+AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION], +[AM_AUTOMAKE_VERSION([1.13.3])dnl +m4_ifndef([AC_AUTOCONF_VERSION], + [m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl +_AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))]) + +# Figure out how to run the assembler. -*- Autoconf -*- + +# Copyright (C) 2001-2013 Free Software Foundation, Inc. +# +# This file is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# AM_PROG_AS +# ---------- +AC_DEFUN([AM_PROG_AS], +[# By default we simply use the C compiler to build assembly code. +AC_REQUIRE([AC_PROG_CC]) +test "${CCAS+set}" = set || CCAS=$CC +test "${CCASFLAGS+set}" = set || CCASFLAGS=$CFLAGS +AC_ARG_VAR([CCAS], [assembler compiler command (defaults to CC)]) +AC_ARG_VAR([CCASFLAGS], [assembler compiler flags (defaults to CFLAGS)]) +_AM_IF_OPTION([no-dependencies],, [_AM_DEPENDENCIES([CCAS])])dnl +]) + +# AM_AUX_DIR_EXPAND -*- Autoconf -*- + +# Copyright (C) 2001-2013 Free Software Foundation, Inc. +# +# This file is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# For projects using AC_CONFIG_AUX_DIR([foo]), Autoconf sets +# $ac_aux_dir to '$srcdir/foo'. In other projects, it is set to +# '$srcdir', '$srcdir/..', or '$srcdir/../..'. +# +# Of course, Automake must honor this variable whenever it calls a +# tool from the auxiliary directory. The problem is that $srcdir (and +# therefore $ac_aux_dir as well) can be either absolute or relative, +# depending on how configure is run. This is pretty annoying, since +# it makes $ac_aux_dir quite unusable in subdirectories: in the top +# source directory, any form will work fine, but in subdirectories a +# relative path needs to be adjusted first. +# +# $ac_aux_dir/missing +# fails when called from a subdirectory if $ac_aux_dir is relative +# $top_srcdir/$ac_aux_dir/missing +# fails if $ac_aux_dir is absolute, +# fails when called from a subdirectory in a VPATH build with +# a relative $ac_aux_dir +# +# The reason of the latter failure is that $top_srcdir and $ac_aux_dir +# are both prefixed by $srcdir. In an in-source build this is usually +# harmless because $srcdir is '.', but things will broke when you +# start a VPATH build or use an absolute $srcdir. +# +# So we could use something similar to $top_srcdir/$ac_aux_dir/missing, +# iff we strip the leading $srcdir from $ac_aux_dir. That would be: +# am_aux_dir='\$(top_srcdir)/'`expr "$ac_aux_dir" : "$srcdir//*\(.*\)"` +# and then we would define $MISSING as +# MISSING="\${SHELL} $am_aux_dir/missing" +# This will work as long as MISSING is not called from configure, because +# unfortunately $(top_srcdir) has no meaning in configure. +# However there are other variables, like CC, which are often used in +# configure, and could therefore not use this "fixed" $ac_aux_dir. +# +# Another solution, used here, is to always expand $ac_aux_dir to an +# absolute PATH. The drawback is that using absolute paths prevent a +# configured tree to be moved without reconfiguration. + +AC_DEFUN([AM_AUX_DIR_EXPAND], +[dnl Rely on autoconf to set up CDPATH properly. +AC_PREREQ([2.50])dnl +# expand $ac_aux_dir to an absolute path +am_aux_dir=`cd $ac_aux_dir && pwd` +]) + +# AM_CONDITIONAL -*- Autoconf -*- + +# Copyright (C) 1997-2013 Free Software Foundation, Inc. 
+# +# This file is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# AM_CONDITIONAL(NAME, SHELL-CONDITION) +# ------------------------------------- +# Define a conditional. +AC_DEFUN([AM_CONDITIONAL], +[AC_PREREQ([2.52])dnl + m4_if([$1], [TRUE], [AC_FATAL([$0: invalid condition: $1])], + [$1], [FALSE], [AC_FATAL([$0: invalid condition: $1])])dnl +AC_SUBST([$1_TRUE])dnl +AC_SUBST([$1_FALSE])dnl +_AM_SUBST_NOTMAKE([$1_TRUE])dnl +_AM_SUBST_NOTMAKE([$1_FALSE])dnl +m4_define([_AM_COND_VALUE_$1], [$2])dnl +if $2; then + $1_TRUE= + $1_FALSE='#' +else + $1_TRUE='#' + $1_FALSE= +fi +AC_CONFIG_COMMANDS_PRE( +[if test -z "${$1_TRUE}" && test -z "${$1_FALSE}"; then + AC_MSG_ERROR([[conditional "$1" was never defined. +Usually this means the macro was only invoked conditionally.]]) +fi])]) + +# Copyright (C) 1999-2013 Free Software Foundation, Inc. +# +# This file is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + + +# There are a few dirty hacks below to avoid letting 'AC_PROG_CC' be +# written in clear, in which case automake, when reading aclocal.m4, +# will think it sees a *use*, and therefore will trigger all it's +# C support machinery. Also note that it means that autoscan, seeing +# CC etc. in the Makefile, will ask for an AC_PROG_CC use... + + +# _AM_DEPENDENCIES(NAME) +# ---------------------- +# See how the compiler implements dependency checking. +# NAME is "CC", "CXX", "OBJC", "OBJCXX", "UPC", or "GJC". +# We try a few techniques and use that to set a single cache variable. +# +# We don't AC_REQUIRE the corresponding AC_PROG_CC since the latter was +# modified to invoke _AM_DEPENDENCIES(CC); we would have a circular +# dependency, and given that the user is not expected to run this macro, +# just rely on AC_PROG_CC. +AC_DEFUN([_AM_DEPENDENCIES], +[AC_REQUIRE([AM_SET_DEPDIR])dnl +AC_REQUIRE([AM_OUTPUT_DEPENDENCY_COMMANDS])dnl +AC_REQUIRE([AM_MAKE_INCLUDE])dnl +AC_REQUIRE([AM_DEP_TRACK])dnl + +m4_if([$1], [CC], [depcc="$CC" am_compiler_list=], + [$1], [CXX], [depcc="$CXX" am_compiler_list=], + [$1], [OBJC], [depcc="$OBJC" am_compiler_list='gcc3 gcc'], + [$1], [OBJCXX], [depcc="$OBJCXX" am_compiler_list='gcc3 gcc'], + [$1], [UPC], [depcc="$UPC" am_compiler_list=], + [$1], [GCJ], [depcc="$GCJ" am_compiler_list='gcc3 gcc'], + [depcc="$$1" am_compiler_list=]) + +AC_CACHE_CHECK([dependency style of $depcc], + [am_cv_$1_dependencies_compiler_type], +[if test -z "$AMDEP_TRUE" && test -f "$am_depcomp"; then + # We make a subdir and do the tests there. Otherwise we can end up + # making bogus files that we don't know about and never remove. For + # instance it was reported that on HP-UX the gcc test will end up + # making a dummy file named 'D' -- because '-MD' means "put the output + # in D". + rm -rf conftest.dir + mkdir conftest.dir + # Copy depcomp to subdir because otherwise we won't find it if we're + # using a relative directory. + cp "$am_depcomp" conftest.dir + cd conftest.dir + # We will build objects and dependencies in a subdirectory because + # it helps to detect inapplicable dependency modes. For instance + # both Tru64's cc and ICC support -MD to output dependencies as a + # side effect of compilation, but ICC will put the dependencies in + # the current directory while Tru64 will put them in the object + # directory. 
+ mkdir sub + + am_cv_$1_dependencies_compiler_type=none + if test "$am_compiler_list" = ""; then + am_compiler_list=`sed -n ['s/^#*\([a-zA-Z0-9]*\))$/\1/p'] < ./depcomp` + fi + am__universal=false + m4_case([$1], [CC], + [case " $depcc " in #( + *\ -arch\ *\ -arch\ *) am__universal=true ;; + esac], + [CXX], + [case " $depcc " in #( + *\ -arch\ *\ -arch\ *) am__universal=true ;; + esac]) + + for depmode in $am_compiler_list; do + # Setup a source with many dependencies, because some compilers + # like to wrap large dependency lists on column 80 (with \), and + # we should not choose a depcomp mode which is confused by this. + # + # We need to recreate these files for each test, as the compiler may + # overwrite some of them when testing with obscure command lines. + # This happens at least with the AIX C compiler. + : > sub/conftest.c + for i in 1 2 3 4 5 6; do + echo '#include "conftst'$i'.h"' >> sub/conftest.c + # Using ": > sub/conftst$i.h" creates only sub/conftst1.h with + # Solaris 10 /bin/sh. + echo '/* dummy */' > sub/conftst$i.h + done + echo "${am__include} ${am__quote}sub/conftest.Po${am__quote}" > confmf + + # We check with '-c' and '-o' for the sake of the "dashmstdout" + # mode. It turns out that the SunPro C++ compiler does not properly + # handle '-M -o', and we need to detect this. Also, some Intel + # versions had trouble with output in subdirs. + am__obj=sub/conftest.${OBJEXT-o} + am__minus_obj="-o $am__obj" + case $depmode in + gcc) + # This depmode causes a compiler race in universal mode. + test "$am__universal" = false || continue + ;; + nosideeffect) + # After this tag, mechanisms are not by side-effect, so they'll + # only be used when explicitly requested. + if test "x$enable_dependency_tracking" = xyes; then + continue + else + break + fi + ;; + msvc7 | msvc7msys | msvisualcpp | msvcmsys) + # This compiler won't grok '-c -o', but also, the minuso test has + # not run yet. These depmodes are late enough in the game, and + # so weak that their functioning should not be impacted. + am__obj=conftest.${OBJEXT-o} + am__minus_obj= + ;; + none) break ;; + esac + if depmode=$depmode \ + source=sub/conftest.c object=$am__obj \ + depfile=sub/conftest.Po tmpdepfile=sub/conftest.TPo \ + $SHELL ./depcomp $depcc -c $am__minus_obj sub/conftest.c \ + >/dev/null 2>conftest.err && + grep sub/conftst1.h sub/conftest.Po > /dev/null 2>&1 && + grep sub/conftst6.h sub/conftest.Po > /dev/null 2>&1 && + grep $am__obj sub/conftest.Po > /dev/null 2>&1 && + ${MAKE-make} -s -f confmf > /dev/null 2>&1; then + # icc doesn't choke on unknown options, it will just issue warnings + # or remarks (even with -Werror). So we grep stderr for any message + # that says an option was ignored or not supported. + # When given -MP, icc 7.0 and 7.1 complain thusly: + # icc: Command line warning: ignoring option '-M'; no argument required + # The diagnosis changed in icc 8.0: + # icc: Command line remark: option '-MP' not supported + if (grep 'ignoring option' conftest.err || + grep 'not supported' conftest.err) >/dev/null 2>&1; then :; else + am_cv_$1_dependencies_compiler_type=$depmode + break + fi + fi + done + + cd .. + rm -rf conftest.dir +else + am_cv_$1_dependencies_compiler_type=none +fi +]) +AC_SUBST([$1DEPMODE], [depmode=$am_cv_$1_dependencies_compiler_type]) +AM_CONDITIONAL([am__fastdep$1], [ + test "x$enable_dependency_tracking" != xno \ + && test "$am_cv_$1_dependencies_compiler_type" = gcc3]) +]) + + +# AM_SET_DEPDIR +# ------------- +# Choose a directory name for dependency files. 
+# This macro is AC_REQUIREd in _AM_DEPENDENCIES. +AC_DEFUN([AM_SET_DEPDIR], +[AC_REQUIRE([AM_SET_LEADING_DOT])dnl +AC_SUBST([DEPDIR], ["${am__leading_dot}deps"])dnl +]) + + +# AM_DEP_TRACK +# ------------ +AC_DEFUN([AM_DEP_TRACK], +[AC_ARG_ENABLE([dependency-tracking], [dnl +AS_HELP_STRING( + [--enable-dependency-tracking], + [do not reject slow dependency extractors]) +AS_HELP_STRING( + [--disable-dependency-tracking], + [speeds up one-time build])]) +if test "x$enable_dependency_tracking" != xno; then + am_depcomp="$ac_aux_dir/depcomp" + AMDEPBACKSLASH='\' + am__nodep='_no' +fi +AM_CONDITIONAL([AMDEP], [test "x$enable_dependency_tracking" != xno]) +AC_SUBST([AMDEPBACKSLASH])dnl +_AM_SUBST_NOTMAKE([AMDEPBACKSLASH])dnl +AC_SUBST([am__nodep])dnl +_AM_SUBST_NOTMAKE([am__nodep])dnl +]) + +# Generate code to set up dependency tracking. -*- Autoconf -*- + +# Copyright (C) 1999-2013 Free Software Foundation, Inc. +# +# This file is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + + +# _AM_OUTPUT_DEPENDENCY_COMMANDS +# ------------------------------ +AC_DEFUN([_AM_OUTPUT_DEPENDENCY_COMMANDS], +[{ + # Older Autoconf quotes --file arguments for eval, but not when files + # are listed without --file. Let's play safe and only enable the eval + # if we detect the quoting. + case $CONFIG_FILES in + *\'*) eval set x "$CONFIG_FILES" ;; + *) set x $CONFIG_FILES ;; + esac + shift + for mf + do + # Strip MF so we end up with the name of the file. + mf=`echo "$mf" | sed -e 's/:.*$//'` + # Check whether this is an Automake generated Makefile or not. + # We used to match only the files named 'Makefile.in', but + # some people rename them; so instead we look at the file content. + # Grep'ing the first line is not enough: some people post-process + # each Makefile.in and add a new line on top of each file to say so. + # Grep'ing the whole file is not good either: AIX grep has a line + # limit of 2048, but all sed's we know have understand at least 4000. + if sed -n 's,^#.*generated by automake.*,X,p' "$mf" | grep X >/dev/null 2>&1; then + dirpart=`AS_DIRNAME("$mf")` + else + continue + fi + # Extract the definition of DEPDIR, am__include, and am__quote + # from the Makefile without running 'make'. + DEPDIR=`sed -n 's/^DEPDIR = //p' < "$mf"` + test -z "$DEPDIR" && continue + am__include=`sed -n 's/^am__include = //p' < "$mf"` + test -z "$am__include" && continue + am__quote=`sed -n 's/^am__quote = //p' < "$mf"` + # Find all dependency output files, they are included files with + # $(DEPDIR) in their names. We invoke sed twice because it is the + # simplest approach to changing $(DEPDIR) to its actual value in the + # expansion. + for file in `sed -n " + s/^$am__include $am__quote\(.*(DEPDIR).*\)$am__quote"'$/\1/p' <"$mf" | \ + sed -e 's/\$(DEPDIR)/'"$DEPDIR"'/g'`; do + # Make sure the directory exists. + test -f "$dirpart/$file" && continue + fdir=`AS_DIRNAME(["$file"])` + AS_MKDIR_P([$dirpart/$fdir]) + # echo "creating $dirpart/$file" + echo '# dummy' > "$dirpart/$file" + done + done +} +])# _AM_OUTPUT_DEPENDENCY_COMMANDS + + +# AM_OUTPUT_DEPENDENCY_COMMANDS +# ----------------------------- +# This macro should only be invoked once -- use via AC_REQUIRE. +# +# This code is only required when automatic dependency tracking +# is enabled. FIXME. This creates each '.P' file that we will +# need in order to bootstrap the dependency handling code. 
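+# As an illustrative sketch (not taken from the Automake sources), the
+# _AM_OUTPUT_DEPENDENCY_COMMANDS loop above operates on generated
+# Makefiles that contain, for example:
+#   DEPDIR = .deps
+#   am__include = include
+#   am__quote =
+#   include ./$(DEPDIR)/cpu-miner.Po
+# For such a Makefile it creates ./.deps/cpu-miner.Po holding only the
+# line "# dummy", so the first "make" run can include the file before
+# the real dependency data exists; "cpu-miner.Po" is only a placeholder
+# name here.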
+AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS], +[AC_CONFIG_COMMANDS([depfiles], + [test x"$AMDEP_TRUE" != x"" || _AM_OUTPUT_DEPENDENCY_COMMANDS], + [AMDEP_TRUE="$AMDEP_TRUE" ac_aux_dir="$ac_aux_dir"]) +]) + +# Do all the work for Automake. -*- Autoconf -*- + +# Copyright (C) 1996-2013 Free Software Foundation, Inc. +# +# This file is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This macro actually does too much. Some checks are only needed if +# your package does certain things. But this isn't really a big deal. + +# AM_INIT_AUTOMAKE(PACKAGE, VERSION, [NO-DEFINE]) +# AM_INIT_AUTOMAKE([OPTIONS]) +# ----------------------------------------------- +# The call with PACKAGE and VERSION arguments is the old style +# call (pre autoconf-2.50), which is being phased out. PACKAGE +# and VERSION should now be passed to AC_INIT and removed from +# the call to AM_INIT_AUTOMAKE. +# We support both call styles for the transition. After +# the next Automake release, Autoconf can make the AC_INIT +# arguments mandatory, and then we can depend on a new Autoconf +# release and drop the old call support. +AC_DEFUN([AM_INIT_AUTOMAKE], +[AC_PREREQ([2.65])dnl +dnl Autoconf wants to disallow AM_ names. We explicitly allow +dnl the ones we care about. +m4_pattern_allow([^AM_[A-Z]+FLAGS$])dnl +AC_REQUIRE([AM_SET_CURRENT_AUTOMAKE_VERSION])dnl +AC_REQUIRE([AC_PROG_INSTALL])dnl +if test "`cd $srcdir && pwd`" != "`pwd`"; then + # Use -I$(srcdir) only when $(srcdir) != ., so that make's output + # is not polluted with repeated "-I." + AC_SUBST([am__isrc], [' -I$(srcdir)'])_AM_SUBST_NOTMAKE([am__isrc])dnl + # test to see if srcdir already configured + if test -f $srcdir/config.status; then + AC_MSG_ERROR([source directory already configured; run "make distclean" there first]) + fi +fi + +# test whether we have cygpath +if test -z "$CYGPATH_W"; then + if (cygpath --version) >/dev/null 2>/dev/null; then + CYGPATH_W='cygpath -w' + else + CYGPATH_W=echo + fi +fi +AC_SUBST([CYGPATH_W]) + +# Define the identity of the package. +dnl Distinguish between old-style and new-style calls. +m4_ifval([$2], +[AC_DIAGNOSE([obsolete], + [$0: two- and three-arguments forms are deprecated.]) +m4_ifval([$3], [_AM_SET_OPTION([no-define])])dnl + AC_SUBST([PACKAGE], [$1])dnl + AC_SUBST([VERSION], [$2])], +[_AM_SET_OPTIONS([$1])dnl +dnl Diagnose old-style AC_INIT with new-style AM_AUTOMAKE_INIT. +m4_if( + m4_ifdef([AC_PACKAGE_NAME], [ok]):m4_ifdef([AC_PACKAGE_VERSION], [ok]), + [ok:ok],, + [m4_fatal([AC_INIT should be called with package and version arguments])])dnl + AC_SUBST([PACKAGE], ['AC_PACKAGE_TARNAME'])dnl + AC_SUBST([VERSION], ['AC_PACKAGE_VERSION'])])dnl + +_AM_IF_OPTION([no-define],, +[AC_DEFINE_UNQUOTED([PACKAGE], ["$PACKAGE"], [Name of package]) + AC_DEFINE_UNQUOTED([VERSION], ["$VERSION"], [Version number of package])])dnl + +# Some tools Automake needs. +AC_REQUIRE([AM_SANITY_CHECK])dnl +AC_REQUIRE([AC_ARG_PROGRAM])dnl +AM_MISSING_PROG([ACLOCAL], [aclocal-${am__api_version}]) +AM_MISSING_PROG([AUTOCONF], [autoconf]) +AM_MISSING_PROG([AUTOMAKE], [automake-${am__api_version}]) +AM_MISSING_PROG([AUTOHEADER], [autoheader]) +AM_MISSING_PROG([MAKEINFO], [makeinfo]) +AC_REQUIRE([AM_PROG_INSTALL_SH])dnl +AC_REQUIRE([AM_PROG_INSTALL_STRIP])dnl +AC_REQUIRE([AC_PROG_MKDIR_P])dnl +# For better backward compatibility. To be removed once Automake 1.9.x +# dies out for good. 
For more background, see: +# +# +AC_SUBST([mkdir_p], ['$(MKDIR_P)']) +# We need awk for the "check" target. The system "awk" is bad on +# some platforms. +AC_REQUIRE([AC_PROG_AWK])dnl +AC_REQUIRE([AC_PROG_MAKE_SET])dnl +AC_REQUIRE([AM_SET_LEADING_DOT])dnl +_AM_IF_OPTION([tar-ustar], [_AM_PROG_TAR([ustar])], + [_AM_IF_OPTION([tar-pax], [_AM_PROG_TAR([pax])], + [_AM_PROG_TAR([v7])])]) +_AM_IF_OPTION([no-dependencies],, +[AC_PROVIDE_IFELSE([AC_PROG_CC], + [_AM_DEPENDENCIES([CC])], + [m4_define([AC_PROG_CC], + m4_defn([AC_PROG_CC])[_AM_DEPENDENCIES([CC])])])dnl +AC_PROVIDE_IFELSE([AC_PROG_CXX], + [_AM_DEPENDENCIES([CXX])], + [m4_define([AC_PROG_CXX], + m4_defn([AC_PROG_CXX])[_AM_DEPENDENCIES([CXX])])])dnl +AC_PROVIDE_IFELSE([AC_PROG_OBJC], + [_AM_DEPENDENCIES([OBJC])], + [m4_define([AC_PROG_OBJC], + m4_defn([AC_PROG_OBJC])[_AM_DEPENDENCIES([OBJC])])])dnl +AC_PROVIDE_IFELSE([AC_PROG_OBJCXX], + [_AM_DEPENDENCIES([OBJCXX])], + [m4_define([AC_PROG_OBJCXX], + m4_defn([AC_PROG_OBJCXX])[_AM_DEPENDENCIES([OBJCXX])])])dnl +]) +AC_REQUIRE([AM_SILENT_RULES])dnl +dnl The testsuite driver may need to know about EXEEXT, so add the +dnl 'am__EXEEXT' conditional if _AM_COMPILER_EXEEXT was seen. This +dnl macro is hooked onto _AC_COMPILER_EXEEXT early, see below. +AC_CONFIG_COMMANDS_PRE(dnl +[m4_provide_if([_AM_COMPILER_EXEEXT], + [AM_CONDITIONAL([am__EXEEXT], [test -n "$EXEEXT"])])])dnl +]) + +dnl Hook into '_AC_COMPILER_EXEEXT' early to learn its expansion. Do not +dnl add the conditional right here, as _AC_COMPILER_EXEEXT may be further +dnl mangled by Autoconf and run in a shell conditional statement. +m4_define([_AC_COMPILER_EXEEXT], +m4_defn([_AC_COMPILER_EXEEXT])[m4_provide([_AM_COMPILER_EXEEXT])]) + + +# When config.status generates a header, we must update the stamp-h file. +# This file resides in the same directory as the config header +# that is generated. The stamp files are numbered to have different names. + +# Autoconf calls _AC_AM_CONFIG_HEADER_HOOK (when defined) in the +# loop where config.status creates the headers, so we can generate +# our stamp files there. +AC_DEFUN([_AC_AM_CONFIG_HEADER_HOOK], +[# Compute $1's index in $config_headers. +_am_arg=$1 +_am_stamp_count=1 +for _am_header in $config_headers :; do + case $_am_header in + $_am_arg | $_am_arg:* ) + break ;; + * ) + _am_stamp_count=`expr $_am_stamp_count + 1` ;; + esac +done +echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_count]) + +# Copyright (C) 2001-2013 Free Software Foundation, Inc. +# +# This file is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# AM_PROG_INSTALL_SH +# ------------------ +# Define $install_sh. +AC_DEFUN([AM_PROG_INSTALL_SH], +[AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl +if test x"${install_sh}" != xset; then + case $am_aux_dir in + *\ * | *\ *) + install_sh="\${SHELL} '$am_aux_dir/install-sh'" ;; + *) + install_sh="\${SHELL} $am_aux_dir/install-sh" + esac +fi +AC_SUBST([install_sh])]) + +# Copyright (C) 2003-2013 Free Software Foundation, Inc. +# +# This file is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# Check whether the underlying file-system supports filenames +# with a leading dot. For instance MS-DOS doesn't. 
+AC_DEFUN([AM_SET_LEADING_DOT], +[rm -rf .tst 2>/dev/null +mkdir .tst 2>/dev/null +if test -d .tst; then + am__leading_dot=. +else + am__leading_dot=_ +fi +rmdir .tst 2>/dev/null +AC_SUBST([am__leading_dot])]) + +# Add --enable-maintainer-mode option to configure. -*- Autoconf -*- +# From Jim Meyering + +# Copyright (C) 1996-2013 Free Software Foundation, Inc. +# +# This file is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# AM_MAINTAINER_MODE([DEFAULT-MODE]) +# ---------------------------------- +# Control maintainer-specific portions of Makefiles. +# Default is to disable them, unless 'enable' is passed literally. +# For symmetry, 'disable' may be passed as well. Anyway, the user +# can override the default with the --enable/--disable switch. +AC_DEFUN([AM_MAINTAINER_MODE], +[m4_case(m4_default([$1], [disable]), + [enable], [m4_define([am_maintainer_other], [disable])], + [disable], [m4_define([am_maintainer_other], [enable])], + [m4_define([am_maintainer_other], [enable]) + m4_warn([syntax], [unexpected argument to AM@&t@_MAINTAINER_MODE: $1])]) +AC_MSG_CHECKING([whether to enable maintainer-specific portions of Makefiles]) + dnl maintainer-mode's default is 'disable' unless 'enable' is passed + AC_ARG_ENABLE([maintainer-mode], + [AS_HELP_STRING([--]am_maintainer_other[-maintainer-mode], + am_maintainer_other[ make rules and dependencies not useful + (and sometimes confusing) to the casual installer])], + [USE_MAINTAINER_MODE=$enableval], + [USE_MAINTAINER_MODE=]m4_if(am_maintainer_other, [enable], [no], [yes])) + AC_MSG_RESULT([$USE_MAINTAINER_MODE]) + AM_CONDITIONAL([MAINTAINER_MODE], [test $USE_MAINTAINER_MODE = yes]) + MAINT=$MAINTAINER_MODE_TRUE + AC_SUBST([MAINT])dnl +] +) + +# Check to see how 'make' treats includes. -*- Autoconf -*- + +# Copyright (C) 2001-2013 Free Software Foundation, Inc. +# +# This file is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# AM_MAKE_INCLUDE() +# ----------------- +# Check to see how make treats includes. +AC_DEFUN([AM_MAKE_INCLUDE], +[am_make=${MAKE-make} +cat > confinc << 'END' +am__doit: + @echo this is the am__doit target +.PHONY: am__doit +END +# If we don't find an include directive, just comment out the code. +AC_MSG_CHECKING([for style of include used by $am_make]) +am__include="#" +am__quote= +_am_result=none +# First try GNU make style include. +echo "include confinc" > confmf +# Ignore all kinds of additional output from 'make'. +case `$am_make -s -f confmf 2> /dev/null` in #( +*the\ am__doit\ target*) + am__include=include + am__quote= + _am_result=GNU + ;; +esac +# Now try BSD make style include. +if test "$am__include" = "#"; then + echo '.include "confinc"' > confmf + case `$am_make -s -f confmf 2> /dev/null` in #( + *the\ am__doit\ target*) + am__include=.include + am__quote="\"" + _am_result=BSD + ;; + esac +fi +AC_SUBST([am__include]) +AC_SUBST([am__quote]) +AC_MSG_RESULT([$_am_result]) +rm -f confinc confmf +]) + +# Copyright (C) 1999-2013 Free Software Foundation, Inc. +# +# This file is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# AM_PROG_CC_C_O +# -------------- +# Like AC_PROG_CC_C_O, but changed for automake. 
+AC_DEFUN([AM_PROG_CC_C_O], +[AC_REQUIRE([AC_PROG_CC_C_O])dnl +AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl +AC_REQUIRE_AUX_FILE([compile])dnl +# FIXME: we rely on the cache variable name because +# there is no other way. +set dummy $CC +am_cc=`echo $[2] | sed ['s/[^a-zA-Z0-9_]/_/g;s/^[0-9]/_/']` +eval am_t=\$ac_cv_prog_cc_${am_cc}_c_o +if test "$am_t" != yes; then + # Losing compiler, so override with the script. + # FIXME: It is wrong to rewrite CC. + # But if we don't then we get into trouble of one sort or another. + # A longer-term fix would be to have automake use am__CC in this case, + # and then we could set am__CC="\$(top_srcdir)/compile \$(CC)" + CC="$am_aux_dir/compile $CC" +fi +dnl Make sure AC_PROG_CC is never called again, or it will override our +dnl setting of CC. +m4_define([AC_PROG_CC], + [m4_fatal([AC_PROG_CC cannot be called after AM_PROG_CC_C_O])]) +]) + +# Fake the existence of programs that GNU maintainers use. -*- Autoconf -*- + +# Copyright (C) 1997-2013 Free Software Foundation, Inc. +# +# This file is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# AM_MISSING_PROG(NAME, PROGRAM) +# ------------------------------ +AC_DEFUN([AM_MISSING_PROG], +[AC_REQUIRE([AM_MISSING_HAS_RUN]) +$1=${$1-"${am_missing_run}$2"} +AC_SUBST($1)]) + +# AM_MISSING_HAS_RUN +# ------------------ +# Define MISSING if not defined so far and test if it is modern enough. +# If it is, set am_missing_run to use it, otherwise, to nothing. +AC_DEFUN([AM_MISSING_HAS_RUN], +[AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl +AC_REQUIRE_AUX_FILE([missing])dnl +if test x"${MISSING+set}" != xset; then + case $am_aux_dir in + *\ * | *\ *) + MISSING="\${SHELL} \"$am_aux_dir/missing\"" ;; + *) + MISSING="\${SHELL} $am_aux_dir/missing" ;; + esac +fi +# Use eval to expand $SHELL +if eval "$MISSING --is-lightweight"; then + am_missing_run="$MISSING " +else + am_missing_run= + AC_MSG_WARN(['missing' script is too old or missing]) +fi +]) + +# Helper functions for option handling. -*- Autoconf -*- + +# Copyright (C) 2001-2013 Free Software Foundation, Inc. +# +# This file is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# _AM_MANGLE_OPTION(NAME) +# ----------------------- +AC_DEFUN([_AM_MANGLE_OPTION], +[[_AM_OPTION_]m4_bpatsubst($1, [[^a-zA-Z0-9_]], [_])]) + +# _AM_SET_OPTION(NAME) +# -------------------- +# Set option NAME. Presently that only means defining a flag for this option. +AC_DEFUN([_AM_SET_OPTION], +[m4_define(_AM_MANGLE_OPTION([$1]), [1])]) + +# _AM_SET_OPTIONS(OPTIONS) +# ------------------------ +# OPTIONS is a space-separated list of Automake options. +AC_DEFUN([_AM_SET_OPTIONS], +[m4_foreach_w([_AM_Option], [$1], [_AM_SET_OPTION(_AM_Option)])]) + +# _AM_IF_OPTION(OPTION, IF-SET, [IF-NOT-SET]) +# ------------------------------------------- +# Execute IF-SET if OPTION is set, IF-NOT-SET otherwise. +AC_DEFUN([_AM_IF_OPTION], +[m4_ifset(_AM_MANGLE_OPTION([$1]), [$2], [$3])]) + +# Check to make sure that the build environment is sane. -*- Autoconf -*- + +# Copyright (C) 1996-2013 Free Software Foundation, Inc. +# +# This file is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. 
+ +# AM_SANITY_CHECK +# --------------- +AC_DEFUN([AM_SANITY_CHECK], +[AC_MSG_CHECKING([whether build environment is sane]) +# Reject unsafe characters in $srcdir or the absolute working directory +# name. Accept space and tab only in the latter. +am_lf=' +' +case `pwd` in + *[[\\\"\#\$\&\'\`$am_lf]]*) + AC_MSG_ERROR([unsafe absolute working directory name]);; +esac +case $srcdir in + *[[\\\"\#\$\&\'\`$am_lf\ \ ]]*) + AC_MSG_ERROR([unsafe srcdir value: '$srcdir']);; +esac + +# Do 'set' in a subshell so we don't clobber the current shell's +# arguments. Must try -L first in case configure is actually a +# symlink; some systems play weird games with the mod time of symlinks +# (eg FreeBSD returns the mod time of the symlink's containing +# directory). +if ( + am_has_slept=no + for am_try in 1 2; do + echo "timestamp, slept: $am_has_slept" > conftest.file + set X `ls -Lt "$srcdir/configure" conftest.file 2> /dev/null` + if test "$[*]" = "X"; then + # -L didn't work. + set X `ls -t "$srcdir/configure" conftest.file` + fi + if test "$[*]" != "X $srcdir/configure conftest.file" \ + && test "$[*]" != "X conftest.file $srcdir/configure"; then + + # If neither matched, then we have a broken ls. This can happen + # if, for instance, CONFIG_SHELL is bash and it inherits a + # broken ls alias from the environment. This has actually + # happened. Such a system could not be considered "sane". + AC_MSG_ERROR([ls -t appears to fail. Make sure there is not a broken + alias in your environment]) + fi + if test "$[2]" = conftest.file || test $am_try -eq 2; then + break + fi + # Just in case. + sleep 1 + am_has_slept=yes + done + test "$[2]" = conftest.file + ) +then + # Ok. + : +else + AC_MSG_ERROR([newly created file is older than distributed files! +Check your system clock]) +fi +AC_MSG_RESULT([yes]) +# If we didn't sleep, we still need to ensure time stamps of config.status and +# generated files are strictly newer. +am_sleep_pid= +if grep 'slept: no' conftest.file >/dev/null 2>&1; then + ( sleep 1 ) & + am_sleep_pid=$! +fi +AC_CONFIG_COMMANDS_PRE( + [AC_MSG_CHECKING([that generated files are newer than configure]) + if test -n "$am_sleep_pid"; then + # Hide warnings about reused PIDs. + wait $am_sleep_pid 2>/dev/null + fi + AC_MSG_RESULT([done])]) +rm -f conftest.file +]) + +# Copyright (C) 2009-2013 Free Software Foundation, Inc. +# +# This file is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# AM_SILENT_RULES([DEFAULT]) +# -------------------------- +# Enable less verbose build rules; with the default set to DEFAULT +# ("yes" being less verbose, "no" or empty being verbose). +AC_DEFUN([AM_SILENT_RULES], +[AC_ARG_ENABLE([silent-rules], [dnl +AS_HELP_STRING( + [--enable-silent-rules], + [less verbose build output (undo: "make V=1")]) +AS_HELP_STRING( + [--disable-silent-rules], + [verbose build output (undo: "make V=0")])dnl +]) +case $enable_silent_rules in @%:@ ((( + yes) AM_DEFAULT_VERBOSITY=0;; + no) AM_DEFAULT_VERBOSITY=1;; + *) AM_DEFAULT_VERBOSITY=m4_if([$1], [yes], [0], [1]);; +esac +dnl +dnl A few 'make' implementations (e.g., NonStop OS and NextStep) +dnl do not support nested variable expansions. +dnl See automake bug#9928 and bug#10237. 
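+dnl As an illustrative sketch (not from the Automake sources), the nested
+dnl expansion probed below is the idiom silent rules rely on in the
+dnl generated Makefiles, roughly:
+dnl   AM_V_CC = $(am__v_CC_$(V))
+dnl   am__v_CC_0 = @echo "  CC      " $@;
+dnl so "make V=1" echoes the full compiler command while "make V=0" prints
+dnl the short "  CC  file.o" line; when make cannot nest $(V) inside
+dnl another variable reference, the fallback below pins the verbosity to
+dnl AM_DEFAULT_VERBOSITY instead.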
+am_make=${MAKE-make} +AC_CACHE_CHECK([whether $am_make supports nested variables], + [am_cv_make_support_nested_variables], + [if AS_ECHO([['TRUE=$(BAR$(V)) +BAR0=false +BAR1=true +V=1 +am__doit: + @$(TRUE) +.PHONY: am__doit']]) | $am_make -f - >/dev/null 2>&1; then + am_cv_make_support_nested_variables=yes +else + am_cv_make_support_nested_variables=no +fi]) +if test $am_cv_make_support_nested_variables = yes; then + dnl Using '$V' instead of '$(V)' breaks IRIX make. + AM_V='$(V)' + AM_DEFAULT_V='$(AM_DEFAULT_VERBOSITY)' +else + AM_V=$AM_DEFAULT_VERBOSITY + AM_DEFAULT_V=$AM_DEFAULT_VERBOSITY +fi +AC_SUBST([AM_V])dnl +AM_SUBST_NOTMAKE([AM_V])dnl +AC_SUBST([AM_DEFAULT_V])dnl +AM_SUBST_NOTMAKE([AM_DEFAULT_V])dnl +AC_SUBST([AM_DEFAULT_VERBOSITY])dnl +AM_BACKSLASH='\' +AC_SUBST([AM_BACKSLASH])dnl +_AM_SUBST_NOTMAKE([AM_BACKSLASH])dnl +]) + +# Copyright (C) 2001-2013 Free Software Foundation, Inc. +# +# This file is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# AM_PROG_INSTALL_STRIP +# --------------------- +# One issue with vendor 'install' (even GNU) is that you can't +# specify the program used to strip binaries. This is especially +# annoying in cross-compiling environments, where the build's strip +# is unlikely to handle the host's binaries. +# Fortunately install-sh will honor a STRIPPROG variable, so we +# always use install-sh in "make install-strip", and initialize +# STRIPPROG with the value of the STRIP variable (set by the user). +AC_DEFUN([AM_PROG_INSTALL_STRIP], +[AC_REQUIRE([AM_PROG_INSTALL_SH])dnl +# Installed binaries are usually stripped using 'strip' when the user +# run "make install-strip". However 'strip' might not be the right +# tool to use in cross-compilation environments, therefore Automake +# will honor the 'STRIP' environment variable to overrule this program. +dnl Don't test for $cross_compiling = yes, because it might be 'maybe'. +if test "$cross_compiling" != no; then + AC_CHECK_TOOL([STRIP], [strip], :) +fi +INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s" +AC_SUBST([INSTALL_STRIP_PROGRAM])]) + +# Copyright (C) 2006-2013 Free Software Foundation, Inc. +# +# This file is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# _AM_SUBST_NOTMAKE(VARIABLE) +# --------------------------- +# Prevent Automake from outputting VARIABLE = @VARIABLE@ in Makefile.in. +# This macro is traced by Automake. +AC_DEFUN([_AM_SUBST_NOTMAKE]) + +# AM_SUBST_NOTMAKE(VARIABLE) +# -------------------------- +# Public sister of _AM_SUBST_NOTMAKE. +AC_DEFUN([AM_SUBST_NOTMAKE], [_AM_SUBST_NOTMAKE($@)]) + +# Check how to create a tarball. -*- Autoconf -*- + +# Copyright (C) 2004-2013 Free Software Foundation, Inc. +# +# This file is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# _AM_PROG_TAR(FORMAT) +# -------------------- +# Check how to create a tarball in format FORMAT. +# FORMAT should be one of 'v7', 'ustar', or 'pax'. +# +# Substitute a variable $(am__tar) that is a command +# writing to stdout a FORMAT-tarball containing the directory +# $tardir. +# tardir=directory && $(am__tar) > result.tar +# +# Substitute a variable $(am__untar) that extract such +# a tarball read from stdin. 
+# $(am__untar) < result.tar +# +AC_DEFUN([_AM_PROG_TAR], +[# Always define AMTAR for backward compatibility. Yes, it's still used +# in the wild :-( We should find a proper way to deprecate it ... +AC_SUBST([AMTAR], ['$${TAR-tar}']) + +# We'll loop over all known methods to create a tar archive until one works. +_am_tools='gnutar m4_if([$1], [ustar], [plaintar]) pax cpio none' + +m4_if([$1], [v7], + [am__tar='$${TAR-tar} chof - "$$tardir"' am__untar='$${TAR-tar} xf -'], + + [m4_case([$1], + [ustar], + [# The POSIX 1988 'ustar' format is defined with fixed-size fields. + # There is notably a 21 bits limit for the UID and the GID. In fact, + # the 'pax' utility can hang on bigger UID/GID (see automake bug#8343 + # and bug#13588). + am_max_uid=2097151 # 2^21 - 1 + am_max_gid=$am_max_uid + # The $UID and $GID variables are not portable, so we need to resort + # to the POSIX-mandated id(1) utility. Errors in the 'id' calls + # below are definitely unexpected, so allow the users to see them + # (that is, avoid stderr redirection). + am_uid=`id -u || echo unknown` + am_gid=`id -g || echo unknown` + AC_MSG_CHECKING([whether UID '$am_uid' is supported by ustar format]) + if test $am_uid -le $am_max_uid; then + AC_MSG_RESULT([yes]) + else + AC_MSG_RESULT([no]) + _am_tools=none + fi + AC_MSG_CHECKING([whether GID '$am_gid' is supported by ustar format]) + if test $am_gid -le $am_max_gid; then + AC_MSG_RESULT([yes]) + else + AC_MSG_RESULT([no]) + _am_tools=none + fi], + + [pax], + [], + + [m4_fatal([Unknown tar format])]) + + AC_MSG_CHECKING([how to create a $1 tar archive]) + + # Go ahead even if we have the value already cached. We do so because we + # need to set the values for the 'am__tar' and 'am__untar' variables. + _am_tools=${am_cv_prog_tar_$1-$_am_tools} + + for _am_tool in $_am_tools; do + case $_am_tool in + gnutar) + for _am_tar in tar gnutar gtar; do + AM_RUN_LOG([$_am_tar --version]) && break + done + am__tar="$_am_tar --format=m4_if([$1], [pax], [posix], [$1]) -chf - "'"$$tardir"' + am__tar_="$_am_tar --format=m4_if([$1], [pax], [posix], [$1]) -chf - "'"$tardir"' + am__untar="$_am_tar -xf -" + ;; + plaintar) + # Must skip GNU tar: if it does not support --format= it doesn't create + # ustar tarball either. + (tar --version) >/dev/null 2>&1 && continue + am__tar='tar chf - "$$tardir"' + am__tar_='tar chf - "$tardir"' + am__untar='tar xf -' + ;; + pax) + am__tar='pax -L -x $1 -w "$$tardir"' + am__tar_='pax -L -x $1 -w "$tardir"' + am__untar='pax -r' + ;; + cpio) + am__tar='find "$$tardir" -print | cpio -o -H $1 -L' + am__tar_='find "$tardir" -print | cpio -o -H $1 -L' + am__untar='cpio -i -H $1 -d' + ;; + none) + am__tar=false + am__tar_=false + am__untar=false + ;; + esac + + # If the value was cached, stop now. We just wanted to have am__tar + # and am__untar set. + test -n "${am_cv_prog_tar_$1}" && break + + # tar/untar a dummy directory, and stop if the command works. 
+ rm -rf conftest.dir + mkdir conftest.dir + echo GrepMe > conftest.dir/file + AM_RUN_LOG([tardir=conftest.dir && eval $am__tar_ >conftest.tar]) + rm -rf conftest.dir + if test -s conftest.tar; then + AM_RUN_LOG([$am__untar /dev/null 2>&1 && break + fi + done + rm -rf conftest.dir + + AC_CACHE_VAL([am_cv_prog_tar_$1], [am_cv_prog_tar_$1=$_am_tool]) + AC_MSG_RESULT([$am_cv_prog_tar_$1])]) + +AC_SUBST([am__tar]) +AC_SUBST([am__untar]) +]) # _AM_PROG_TAR + diff --git a/autogen.sh b/autogen.sh new file mode 100755 index 0000000..8261a2c --- /dev/null +++ b/autogen.sh @@ -0,0 +1 @@ +aclocal && autoheader && automake --add-missing --gnu --copy && autoconf diff --git a/blake.c b/blake.c new file mode 100644 index 0000000..a9043e9 --- /dev/null +++ b/blake.c @@ -0,0 +1,1120 @@ +/* $Id: blake.c 252 2011-06-07 17:55:14Z tp $ */ +/* + * BLAKE implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include +#include +#include + +#include "sph_blake.h" + +#ifdef __cplusplus +extern "C"{ +#endif + +#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_BLAKE +#define SPH_SMALL_FOOTPRINT_BLAKE 1 +#endif + +#if SPH_SMALL_FOOTPRINT_BLAKE +#define SPH_COMPACT_BLAKE_32 1 +#endif + +#if SPH_64 && (SPH_SMALL_FOOTPRINT_BLAKE || !SPH_64_TRUE) +#define SPH_COMPACT_BLAKE_64 1 +#endif + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +static const sph_u32 IV224[8] = { + SPH_C32(0xC1059ED8), SPH_C32(0x367CD507), + SPH_C32(0x3070DD17), SPH_C32(0xF70E5939), + SPH_C32(0xFFC00B31), SPH_C32(0x68581511), + SPH_C32(0x64F98FA7), SPH_C32(0xBEFA4FA4) +}; + +static const sph_u32 IV256[8] = { + SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85), + SPH_C32(0x3C6EF372), SPH_C32(0xA54FF53A), + SPH_C32(0x510E527F), SPH_C32(0x9B05688C), + SPH_C32(0x1F83D9AB), SPH_C32(0x5BE0CD19) +}; + +#if SPH_64 + +static const sph_u64 IV384[8] = { + SPH_C64(0xCBBB9D5DC1059ED8), SPH_C64(0x629A292A367CD507), + SPH_C64(0x9159015A3070DD17), SPH_C64(0x152FECD8F70E5939), + SPH_C64(0x67332667FFC00B31), SPH_C64(0x8EB44A8768581511), + SPH_C64(0xDB0C2E0D64F98FA7), SPH_C64(0x47B5481DBEFA4FA4) +}; + +static const sph_u64 IV512[8] = { + SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B), + SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1), + SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F), + SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179) +}; + +#endif + +#if SPH_COMPACT_BLAKE_32 || SPH_COMPACT_BLAKE_64 + +static const unsigned sigma[16][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } +}; + +/* + 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + 14 10 4 8 9 15 13 6 1 12 0 2 11 7 5 3 + 11 8 12 0 5 2 15 13 10 14 3 6 7 1 9 4 + 7 9 3 1 13 12 11 14 2 6 5 10 4 0 15 8 + 9 0 5 7 2 4 10 15 14 1 11 12 6 8 3 13 + 2 12 6 10 0 11 8 3 4 13 7 5 15 14 1 9 + 12 5 1 15 14 13 4 10 0 7 6 3 9 2 8 11 + 13 11 7 14 12 1 3 9 5 0 15 4 8 6 2 10 + 6 15 14 9 11 3 0 8 12 2 13 7 1 4 10 5 + 10 2 8 4 7 6 1 5 15 11 9 14 3 12 13 0 +*/ +#endif + +#define Z00 0 +#define Z01 1 +#define Z02 2 +#define Z03 3 +#define Z04 4 +#define Z05 5 +#define Z06 6 +#define Z07 7 +#define Z08 8 +#define Z09 9 +#define Z0A A +#define Z0B B +#define Z0C C +#define Z0D D +#define Z0E E +#define Z0F F + +#define Z10 E +#define Z11 A +#define Z12 4 +#define Z13 8 +#define Z14 9 +#define Z15 F +#define Z16 D +#define Z17 6 +#define Z18 1 +#define Z19 C +#define Z1A 0 +#define Z1B 2 +#define Z1C B +#define Z1D 7 +#define Z1E 5 +#define Z1F 3 + +#define Z20 B 
+#define Z21 8 +#define Z22 C +#define Z23 0 +#define Z24 5 +#define Z25 2 +#define Z26 F +#define Z27 D +#define Z28 A +#define Z29 E +#define Z2A 3 +#define Z2B 6 +#define Z2C 7 +#define Z2D 1 +#define Z2E 9 +#define Z2F 4 + +#define Z30 7 +#define Z31 9 +#define Z32 3 +#define Z33 1 +#define Z34 D +#define Z35 C +#define Z36 B +#define Z37 E +#define Z38 2 +#define Z39 6 +#define Z3A 5 +#define Z3B A +#define Z3C 4 +#define Z3D 0 +#define Z3E F +#define Z3F 8 + +#define Z40 9 +#define Z41 0 +#define Z42 5 +#define Z43 7 +#define Z44 2 +#define Z45 4 +#define Z46 A +#define Z47 F +#define Z48 E +#define Z49 1 +#define Z4A B +#define Z4B C +#define Z4C 6 +#define Z4D 8 +#define Z4E 3 +#define Z4F D + +#define Z50 2 +#define Z51 C +#define Z52 6 +#define Z53 A +#define Z54 0 +#define Z55 B +#define Z56 8 +#define Z57 3 +#define Z58 4 +#define Z59 D +#define Z5A 7 +#define Z5B 5 +#define Z5C F +#define Z5D E +#define Z5E 1 +#define Z5F 9 + +#define Z60 C +#define Z61 5 +#define Z62 1 +#define Z63 F +#define Z64 E +#define Z65 D +#define Z66 4 +#define Z67 A +#define Z68 0 +#define Z69 7 +#define Z6A 6 +#define Z6B 3 +#define Z6C 9 +#define Z6D 2 +#define Z6E 8 +#define Z6F B + +#define Z70 D +#define Z71 B +#define Z72 7 +#define Z73 E +#define Z74 C +#define Z75 1 +#define Z76 3 +#define Z77 9 +#define Z78 5 +#define Z79 0 +#define Z7A F +#define Z7B 4 +#define Z7C 8 +#define Z7D 6 +#define Z7E 2 +#define Z7F A + +#define Z80 6 +#define Z81 F +#define Z82 E +#define Z83 9 +#define Z84 B +#define Z85 3 +#define Z86 0 +#define Z87 8 +#define Z88 C +#define Z89 2 +#define Z8A D +#define Z8B 7 +#define Z8C 1 +#define Z8D 4 +#define Z8E A +#define Z8F 5 + +#define Z90 A +#define Z91 2 +#define Z92 8 +#define Z93 4 +#define Z94 7 +#define Z95 6 +#define Z96 1 +#define Z97 5 +#define Z98 F +#define Z99 B +#define Z9A 9 +#define Z9B E +#define Z9C 3 +#define Z9D C +#define Z9E D +#define Z9F 0 + +#define Mx(r, i) Mx_(Z ## r ## i) +#define Mx_(n) Mx__(n) +#define Mx__(n) M ## n + +#define CSx(r, i) CSx_(Z ## r ## i) +#define CSx_(n) CSx__(n) +#define CSx__(n) CS ## n + +#define CS0 SPH_C32(0x243F6A88) +#define CS1 SPH_C32(0x85A308D3) +#define CS2 SPH_C32(0x13198A2E) +#define CS3 SPH_C32(0x03707344) +#define CS4 SPH_C32(0xA4093822) +#define CS5 SPH_C32(0x299F31D0) +#define CS6 SPH_C32(0x082EFA98) +#define CS7 SPH_C32(0xEC4E6C89) +#define CS8 SPH_C32(0x452821E6) +#define CS9 SPH_C32(0x38D01377) +#define CSA SPH_C32(0xBE5466CF) +#define CSB SPH_C32(0x34E90C6C) +#define CSC SPH_C32(0xC0AC29B7) +#define CSD SPH_C32(0xC97C50DD) +#define CSE SPH_C32(0x3F84D5B5) +#define CSF SPH_C32(0xB5470917) + +#if SPH_COMPACT_BLAKE_32 + +static const sph_u32 CS[16] = { + SPH_C32(0x243F6A88), SPH_C32(0x85A308D3), + SPH_C32(0x13198A2E), SPH_C32(0x03707344), + SPH_C32(0xA4093822), SPH_C32(0x299F31D0), + SPH_C32(0x082EFA98), SPH_C32(0xEC4E6C89), + SPH_C32(0x452821E6), SPH_C32(0x38D01377), + SPH_C32(0xBE5466CF), SPH_C32(0x34E90C6C), + SPH_C32(0xC0AC29B7), SPH_C32(0xC97C50DD), + SPH_C32(0x3F84D5B5), SPH_C32(0xB5470917) +}; + +#endif + +#if SPH_64 + +#define CBx(r, i) CBx_(Z ## r ## i) +#define CBx_(n) CBx__(n) +#define CBx__(n) CB ## n + +#define CB0 SPH_C64(0x243F6A8885A308D3) +#define CB1 SPH_C64(0x13198A2E03707344) +#define CB2 SPH_C64(0xA4093822299F31D0) +#define CB3 SPH_C64(0x082EFA98EC4E6C89) +#define CB4 SPH_C64(0x452821E638D01377) +#define CB5 SPH_C64(0xBE5466CF34E90C6C) +#define CB6 SPH_C64(0xC0AC29B7C97C50DD) +#define CB7 SPH_C64(0x3F84D5B5B5470917) +#define CB8 SPH_C64(0x9216D5D98979FB1B) +#define CB9 
SPH_C64(0xD1310BA698DFB5AC) +#define CBA SPH_C64(0x2FFD72DBD01ADFB7) +#define CBB SPH_C64(0xB8E1AFED6A267E96) +#define CBC SPH_C64(0xBA7C9045F12C7F99) +#define CBD SPH_C64(0x24A19947B3916CF7) +#define CBE SPH_C64(0x0801F2E2858EFC16) +#define CBF SPH_C64(0x636920D871574E69) + +#if SPH_COMPACT_BLAKE_64 + +static const sph_u64 CB[16] = { + SPH_C64(0x243F6A8885A308D3), SPH_C64(0x13198A2E03707344), + SPH_C64(0xA4093822299F31D0), SPH_C64(0x082EFA98EC4E6C89), + SPH_C64(0x452821E638D01377), SPH_C64(0xBE5466CF34E90C6C), + SPH_C64(0xC0AC29B7C97C50DD), SPH_C64(0x3F84D5B5B5470917), + SPH_C64(0x9216D5D98979FB1B), SPH_C64(0xD1310BA698DFB5AC), + SPH_C64(0x2FFD72DBD01ADFB7), SPH_C64(0xB8E1AFED6A267E96), + SPH_C64(0xBA7C9045F12C7F99), SPH_C64(0x24A19947B3916CF7), + SPH_C64(0x0801F2E2858EFC16), SPH_C64(0x636920D871574E69) +}; + +#endif + +#endif + +#define GS(m0, m1, c0, c1, a, b, c, d) do { \ + a = SPH_T32(a + b + (m0 ^ c1)); \ + d = SPH_ROTR32(d ^ a, 16); \ + c = SPH_T32(c + d); \ + b = SPH_ROTR32(b ^ c, 12); \ + a = SPH_T32(a + b + (m1 ^ c0)); \ + d = SPH_ROTR32(d ^ a, 8); \ + c = SPH_T32(c + d); \ + b = SPH_ROTR32(b ^ c, 7); \ + } while (0) + +#if SPH_COMPACT_BLAKE_32 + +#define ROUND_S(r) do { \ + GS(M[sigma[r][0x0]], M[sigma[r][0x1]], \ + CS[sigma[r][0x0]], CS[sigma[r][0x1]], V0, V4, V8, VC); \ + GS(M[sigma[r][0x2]], M[sigma[r][0x3]], \ + CS[sigma[r][0x2]], CS[sigma[r][0x3]], V1, V5, V9, VD); \ + GS(M[sigma[r][0x4]], M[sigma[r][0x5]], \ + CS[sigma[r][0x4]], CS[sigma[r][0x5]], V2, V6, VA, VE); \ + GS(M[sigma[r][0x6]], M[sigma[r][0x7]], \ + CS[sigma[r][0x6]], CS[sigma[r][0x7]], V3, V7, VB, VF); \ + GS(M[sigma[r][0x8]], M[sigma[r][0x9]], \ + CS[sigma[r][0x8]], CS[sigma[r][0x9]], V0, V5, VA, VF); \ + GS(M[sigma[r][0xA]], M[sigma[r][0xB]], \ + CS[sigma[r][0xA]], CS[sigma[r][0xB]], V1, V6, VB, VC); \ + GS(M[sigma[r][0xC]], M[sigma[r][0xD]], \ + CS[sigma[r][0xC]], CS[sigma[r][0xD]], V2, V7, V8, VD); \ + GS(M[sigma[r][0xE]], M[sigma[r][0xF]], \ + CS[sigma[r][0xE]], CS[sigma[r][0xF]], V3, V4, V9, VE); \ + } while (0) + +#else + +#define ROUND_S(r) do { \ + GS(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \ + GS(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \ + GS(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \ + GS(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \ + GS(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \ + GS(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \ + GS(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \ + GS(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \ + } while (0) + +#endif + +#if SPH_64 + +#define GB(m0, m1, c0, c1, a, b, c, d) do { \ + a = SPH_T64(a + b + (m0 ^ c1)); \ + d = SPH_ROTR64(d ^ a, 32); \ + c = SPH_T64(c + d); \ + b = SPH_ROTR64(b ^ c, 25); \ + a = SPH_T64(a + b + (m1 ^ c0)); \ + d = SPH_ROTR64(d ^ a, 16); \ + c = SPH_T64(c + d); \ + b = SPH_ROTR64(b ^ c, 11); \ + } while (0) + +#if SPH_COMPACT_BLAKE_64 + +#define ROUND_B(r) do { \ + GB(M[sigma[r][0x0]], M[sigma[r][0x1]], \ + CB[sigma[r][0x0]], CB[sigma[r][0x1]], V0, V4, V8, VC); \ + GB(M[sigma[r][0x2]], M[sigma[r][0x3]], \ + CB[sigma[r][0x2]], CB[sigma[r][0x3]], V1, V5, V9, VD); \ + GB(M[sigma[r][0x4]], M[sigma[r][0x5]], \ + CB[sigma[r][0x4]], CB[sigma[r][0x5]], V2, V6, VA, VE); \ + GB(M[sigma[r][0x6]], M[sigma[r][0x7]], \ + CB[sigma[r][0x6]], CB[sigma[r][0x7]], V3, V7, VB, VF); \ + GB(M[sigma[r][0x8]], M[sigma[r][0x9]], \ + CB[sigma[r][0x8]], CB[sigma[r][0x9]], V0, V5, VA, VF); \ + GB(M[sigma[r][0xA]], M[sigma[r][0xB]], \ + 
CB[sigma[r][0xA]], CB[sigma[r][0xB]], V1, V6, VB, VC); \ + GB(M[sigma[r][0xC]], M[sigma[r][0xD]], \ + CB[sigma[r][0xC]], CB[sigma[r][0xD]], V2, V7, V8, VD); \ + GB(M[sigma[r][0xE]], M[sigma[r][0xF]], \ + CB[sigma[r][0xE]], CB[sigma[r][0xF]], V3, V4, V9, VE); \ + } while (0) + +#else + +#define ROUND_B(r) do { \ + GB(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \ + GB(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \ + GB(Mx(r, 4), Mx(r, 5), CBx(r, 4), CBx(r, 5), V2, V6, VA, VE); \ + GB(Mx(r, 6), Mx(r, 7), CBx(r, 6), CBx(r, 7), V3, V7, VB, VF); \ + GB(Mx(r, 8), Mx(r, 9), CBx(r, 8), CBx(r, 9), V0, V5, VA, VF); \ + GB(Mx(r, A), Mx(r, B), CBx(r, A), CBx(r, B), V1, V6, VB, VC); \ + GB(Mx(r, C), Mx(r, D), CBx(r, C), CBx(r, D), V2, V7, V8, VD); \ + GB(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \ + } while (0) + +#endif + +#endif + +#define DECL_STATE32 \ + sph_u32 H0, H1, H2, H3, H4, H5, H6, H7; \ + sph_u32 S0, S1, S2, S3, T0, T1; + +#define READ_STATE32(state) do { \ + H0 = (state)->H[0]; \ + H1 = (state)->H[1]; \ + H2 = (state)->H[2]; \ + H3 = (state)->H[3]; \ + H4 = (state)->H[4]; \ + H5 = (state)->H[5]; \ + H6 = (state)->H[6]; \ + H7 = (state)->H[7]; \ + S0 = (state)->S[0]; \ + S1 = (state)->S[1]; \ + S2 = (state)->S[2]; \ + S3 = (state)->S[3]; \ + T0 = (state)->T0; \ + T1 = (state)->T1; \ + } while (0) + +#define WRITE_STATE32(state) do { \ + (state)->H[0] = H0; \ + (state)->H[1] = H1; \ + (state)->H[2] = H2; \ + (state)->H[3] = H3; \ + (state)->H[4] = H4; \ + (state)->H[5] = H5; \ + (state)->H[6] = H6; \ + (state)->H[7] = H7; \ + (state)->S[0] = S0; \ + (state)->S[1] = S1; \ + (state)->S[2] = S2; \ + (state)->S[3] = S3; \ + (state)->T0 = T0; \ + (state)->T1 = T1; \ + } while (0) + +#if SPH_COMPACT_BLAKE_32 + +#define COMPRESS32 do { \ + sph_u32 M[16]; \ + sph_u32 V0, V1, V2, V3, V4, V5, V6, V7; \ + sph_u32 V8, V9, VA, VB, VC, VD, VE, VF; \ + unsigned r; \ + V0 = H0; \ + V1 = H1; \ + V2 = H2; \ + V3 = H3; \ + V4 = H4; \ + V5 = H5; \ + V6 = H6; \ + V7 = H7; \ + V8 = S0 ^ CS0; \ + V9 = S1 ^ CS1; \ + VA = S2 ^ CS2; \ + VB = S3 ^ CS3; \ + VC = T0 ^ CS4; \ + VD = T0 ^ CS5; \ + VE = T1 ^ CS6; \ + VF = T1 ^ CS7; \ + M[0x0] = sph_dec32be_aligned(buf + 0); \ + M[0x1] = sph_dec32be_aligned(buf + 4); \ + M[0x2] = sph_dec32be_aligned(buf + 8); \ + M[0x3] = sph_dec32be_aligned(buf + 12); \ + M[0x4] = sph_dec32be_aligned(buf + 16); \ + M[0x5] = sph_dec32be_aligned(buf + 20); \ + M[0x6] = sph_dec32be_aligned(buf + 24); \ + M[0x7] = sph_dec32be_aligned(buf + 28); \ + M[0x8] = sph_dec32be_aligned(buf + 32); \ + M[0x9] = sph_dec32be_aligned(buf + 36); \ + M[0xA] = sph_dec32be_aligned(buf + 40); \ + M[0xB] = sph_dec32be_aligned(buf + 44); \ + M[0xC] = sph_dec32be_aligned(buf + 48); \ + M[0xD] = sph_dec32be_aligned(buf + 52); \ + M[0xE] = sph_dec32be_aligned(buf + 56); \ + M[0xF] = sph_dec32be_aligned(buf + 60); \ + for (r = 0; r < 14; r ++) \ + ROUND_S(r); \ + H0 ^= S0 ^ V0 ^ V8; \ + H1 ^= S1 ^ V1 ^ V9; \ + H2 ^= S2 ^ V2 ^ VA; \ + H3 ^= S3 ^ V3 ^ VB; \ + H4 ^= S0 ^ V4 ^ VC; \ + H5 ^= S1 ^ V5 ^ VD; \ + H6 ^= S2 ^ V6 ^ VE; \ + H7 ^= S3 ^ V7 ^ VF; \ + } while (0) + +#else + +#define COMPRESS32 do { \ + sph_u32 M0, M1, M2, M3, M4, M5, M6, M7; \ + sph_u32 M8, M9, MA, MB, MC, MD, ME, MF; \ + sph_u32 V0, V1, V2, V3, V4, V5, V6, V7; \ + sph_u32 V8, V9, VA, VB, VC, VD, VE, VF; \ + V0 = H0; \ + V1 = H1; \ + V2 = H2; \ + V3 = H3; \ + V4 = H4; \ + V5 = H5; \ + V6 = H6; \ + V7 = H7; \ + V8 = S0 ^ CS0; \ + V9 = S1 ^ CS1; \ + VA = S2 ^ CS2; \ + VB = S3 ^ CS3; \ + VC = T0 ^ CS4; \ + VD 
= T0 ^ CS5; \ + VE = T1 ^ CS6; \ + VF = T1 ^ CS7; \ + M0 = sph_dec32be_aligned(buf + 0); \ + M1 = sph_dec32be_aligned(buf + 4); \ + M2 = sph_dec32be_aligned(buf + 8); \ + M3 = sph_dec32be_aligned(buf + 12); \ + M4 = sph_dec32be_aligned(buf + 16); \ + M5 = sph_dec32be_aligned(buf + 20); \ + M6 = sph_dec32be_aligned(buf + 24); \ + M7 = sph_dec32be_aligned(buf + 28); \ + M8 = sph_dec32be_aligned(buf + 32); \ + M9 = sph_dec32be_aligned(buf + 36); \ + MA = sph_dec32be_aligned(buf + 40); \ + MB = sph_dec32be_aligned(buf + 44); \ + MC = sph_dec32be_aligned(buf + 48); \ + MD = sph_dec32be_aligned(buf + 52); \ + ME = sph_dec32be_aligned(buf + 56); \ + MF = sph_dec32be_aligned(buf + 60); \ + ROUND_S(0); \ + ROUND_S(1); \ + ROUND_S(2); \ + ROUND_S(3); \ + ROUND_S(4); \ + ROUND_S(5); \ + ROUND_S(6); \ + ROUND_S(7); \ + ROUND_S(8); \ + ROUND_S(9); \ + ROUND_S(0); \ + ROUND_S(1); \ + ROUND_S(2); \ + ROUND_S(3); \ + H0 ^= S0 ^ V0 ^ V8; \ + H1 ^= S1 ^ V1 ^ V9; \ + H2 ^= S2 ^ V2 ^ VA; \ + H3 ^= S3 ^ V3 ^ VB; \ + H4 ^= S0 ^ V4 ^ VC; \ + H5 ^= S1 ^ V5 ^ VD; \ + H6 ^= S2 ^ V6 ^ VE; \ + H7 ^= S3 ^ V7 ^ VF; \ + } while (0) + +#endif + +#if SPH_64 + +#define DECL_STATE64 \ + sph_u64 H0, H1, H2, H3, H4, H5, H6, H7; \ + sph_u64 S0, S1, S2, S3, T0, T1; + +#define READ_STATE64(state) do { \ + H0 = (state)->H[0]; \ + H1 = (state)->H[1]; \ + H2 = (state)->H[2]; \ + H3 = (state)->H[3]; \ + H4 = (state)->H[4]; \ + H5 = (state)->H[5]; \ + H6 = (state)->H[6]; \ + H7 = (state)->H[7]; \ + S0 = (state)->S[0]; \ + S1 = (state)->S[1]; \ + S2 = (state)->S[2]; \ + S3 = (state)->S[3]; \ + T0 = (state)->T0; \ + T1 = (state)->T1; \ + } while (0) + +#define WRITE_STATE64(state) do { \ + (state)->H[0] = H0; \ + (state)->H[1] = H1; \ + (state)->H[2] = H2; \ + (state)->H[3] = H3; \ + (state)->H[4] = H4; \ + (state)->H[5] = H5; \ + (state)->H[6] = H6; \ + (state)->H[7] = H7; \ + (state)->S[0] = S0; \ + (state)->S[1] = S1; \ + (state)->S[2] = S2; \ + (state)->S[3] = S3; \ + (state)->T0 = T0; \ + (state)->T1 = T1; \ + } while (0) + +#if SPH_COMPACT_BLAKE_64 + +#define COMPRESS64 do { \ + sph_u64 M[16]; \ + sph_u64 V0, V1, V2, V3, V4, V5, V6, V7; \ + sph_u64 V8, V9, VA, VB, VC, VD, VE, VF; \ + unsigned r; \ + V0 = H0; \ + V1 = H1; \ + V2 = H2; \ + V3 = H3; \ + V4 = H4; \ + V5 = H5; \ + V6 = H6; \ + V7 = H7; \ + V8 = S0 ^ CB0; \ + V9 = S1 ^ CB1; \ + VA = S2 ^ CB2; \ + VB = S3 ^ CB3; \ + VC = T0 ^ CB4; \ + VD = T0 ^ CB5; \ + VE = T1 ^ CB6; \ + VF = T1 ^ CB7; \ + M[0x0] = sph_dec64be_aligned(buf + 0); \ + M[0x1] = sph_dec64be_aligned(buf + 8); \ + M[0x2] = sph_dec64be_aligned(buf + 16); \ + M[0x3] = sph_dec64be_aligned(buf + 24); \ + M[0x4] = sph_dec64be_aligned(buf + 32); \ + M[0x5] = sph_dec64be_aligned(buf + 40); \ + M[0x6] = sph_dec64be_aligned(buf + 48); \ + M[0x7] = sph_dec64be_aligned(buf + 56); \ + M[0x8] = sph_dec64be_aligned(buf + 64); \ + M[0x9] = sph_dec64be_aligned(buf + 72); \ + M[0xA] = sph_dec64be_aligned(buf + 80); \ + M[0xB] = sph_dec64be_aligned(buf + 88); \ + M[0xC] = sph_dec64be_aligned(buf + 96); \ + M[0xD] = sph_dec64be_aligned(buf + 104); \ + M[0xE] = sph_dec64be_aligned(buf + 112); \ + M[0xF] = sph_dec64be_aligned(buf + 120); \ + for (r = 0; r < 16; r ++) \ + ROUND_B(r); \ + H0 ^= S0 ^ V0 ^ V8; \ + H1 ^= S1 ^ V1 ^ V9; \ + H2 ^= S2 ^ V2 ^ VA; \ + H3 ^= S3 ^ V3 ^ VB; \ + H4 ^= S0 ^ V4 ^ VC; \ + H5 ^= S1 ^ V5 ^ VD; \ + H6 ^= S2 ^ V6 ^ VE; \ + H7 ^= S3 ^ V7 ^ VF; \ + } while (0) + +#else + +#define COMPRESS64 do { \ + sph_u64 M0, M1, M2, M3, M4, M5, M6, M7; \ + sph_u64 M8, M9, MA, MB, MC, MD, ME, MF; \ + sph_u64 V0, V1, 
V2, V3, V4, V5, V6, V7; \ + sph_u64 V8, V9, VA, VB, VC, VD, VE, VF; \ + V0 = H0; \ + V1 = H1; \ + V2 = H2; \ + V3 = H3; \ + V4 = H4; \ + V5 = H5; \ + V6 = H6; \ + V7 = H7; \ + V8 = S0 ^ CB0; \ + V9 = S1 ^ CB1; \ + VA = S2 ^ CB2; \ + VB = S3 ^ CB3; \ + VC = T0 ^ CB4; \ + VD = T0 ^ CB5; \ + VE = T1 ^ CB6; \ + VF = T1 ^ CB7; \ + M0 = sph_dec64be_aligned(buf + 0); \ + M1 = sph_dec64be_aligned(buf + 8); \ + M2 = sph_dec64be_aligned(buf + 16); \ + M3 = sph_dec64be_aligned(buf + 24); \ + M4 = sph_dec64be_aligned(buf + 32); \ + M5 = sph_dec64be_aligned(buf + 40); \ + M6 = sph_dec64be_aligned(buf + 48); \ + M7 = sph_dec64be_aligned(buf + 56); \ + M8 = sph_dec64be_aligned(buf + 64); \ + M9 = sph_dec64be_aligned(buf + 72); \ + MA = sph_dec64be_aligned(buf + 80); \ + MB = sph_dec64be_aligned(buf + 88); \ + MC = sph_dec64be_aligned(buf + 96); \ + MD = sph_dec64be_aligned(buf + 104); \ + ME = sph_dec64be_aligned(buf + 112); \ + MF = sph_dec64be_aligned(buf + 120); \ + ROUND_B(0); \ + ROUND_B(1); \ + ROUND_B(2); \ + ROUND_B(3); \ + ROUND_B(4); \ + ROUND_B(5); \ + ROUND_B(6); \ + ROUND_B(7); \ + ROUND_B(8); \ + ROUND_B(9); \ + ROUND_B(0); \ + ROUND_B(1); \ + ROUND_B(2); \ + ROUND_B(3); \ + ROUND_B(4); \ + ROUND_B(5); \ + H0 ^= S0 ^ V0 ^ V8; \ + H1 ^= S1 ^ V1 ^ V9; \ + H2 ^= S2 ^ V2 ^ VA; \ + H3 ^= S3 ^ V3 ^ VB; \ + H4 ^= S0 ^ V4 ^ VC; \ + H5 ^= S1 ^ V5 ^ VD; \ + H6 ^= S2 ^ V6 ^ VE; \ + H7 ^= S3 ^ V7 ^ VF; \ + } while (0) + +#endif + +#endif + +static const sph_u32 salt_zero_small[4] = { 0, 0, 0, 0 }; + +static void +blake32_init(sph_blake_small_context *sc, + const sph_u32 *iv, const sph_u32 *salt) +{ + memcpy(sc->H, iv, 8 * sizeof(sph_u32)); + memcpy(sc->S, salt, 4 * sizeof(sph_u32)); + sc->T0 = sc->T1 = 0; + sc->ptr = 0; +} + +static void +blake32(sph_blake_small_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + DECL_STATE32 + + buf = sc->buf; + ptr = sc->ptr; + if (len < (sizeof sc->buf) - ptr) { + memcpy(buf + ptr, data, len); + ptr += len; + sc->ptr = ptr; + return; + } + + READ_STATE32(sc); + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data = (const unsigned char *)data + clen; + len -= clen; + if (ptr == sizeof sc->buf) { + if ((T0 = SPH_T32(T0 + 512)) < 512) + T1 = SPH_T32(T1 + 1); + COMPRESS32; + ptr = 0; + } + } + WRITE_STATE32(sc); + sc->ptr = ptr; +} + +static void +blake32_close(sph_blake_small_context *sc, + unsigned ub, unsigned n, void *dst, size_t out_size_w32) +{ + union { + unsigned char buf[64]; + sph_u32 dummy; + } u; + size_t ptr, k; + unsigned bit_len; + unsigned z; + sph_u32 th, tl; + unsigned char *out; + + ptr = sc->ptr; + bit_len = ((unsigned)ptr << 3) + n; + z = 0x80 >> n; + u.buf[ptr] = ((ub & -z) | z) & 0xFF; + tl = sc->T0 + bit_len; + th = sc->T1; + if (ptr == 0 && n == 0) { + sc->T0 = SPH_C32(0xFFFFFE00); + sc->T1 = SPH_C32(0xFFFFFFFF); + } else if (sc->T0 == 0) { + sc->T0 = SPH_C32(0xFFFFFE00) + bit_len; + sc->T1 = SPH_T32(sc->T1 - 1); + } else { + sc->T0 -= 512 - bit_len; + } + if (bit_len <= 446) { + memset(u.buf + ptr + 1, 0, 55 - ptr); + if (out_size_w32 == 8) + u.buf[55] |= 1; + sph_enc32be_aligned(u.buf + 56, th); + sph_enc32be_aligned(u.buf + 60, tl); + blake32(sc, u.buf + ptr, 64 - ptr); + } else { + memset(u.buf + ptr + 1, 0, 63 - ptr); + blake32(sc, u.buf + ptr, 64 - ptr); + sc->T0 = SPH_C32(0xFFFFFE00); + sc->T1 = SPH_C32(0xFFFFFFFF); + memset(u.buf, 0, 56); + if (out_size_w32 == 8) + u.buf[55] = 1; + 
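/*
 * Note on the surrounding lines: blake32_close() is building the
 * spill-over second final block at this point.  The 0x80 padding
 * marker already went out with the previous block, so this block is
 * all zero bytes except for the output-size flag (low bit of byte 55,
 * set for BLAKE-256 and left clear for BLAKE-224) and, in bytes
 * 56..63, the big-endian 64-bit count of message bits: th is the high
 * counter word T1 and tl is the low word derived from T0.
 */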
sph_enc32be_aligned(u.buf + 56, th); + sph_enc32be_aligned(u.buf + 60, tl); + blake32(sc, u.buf, 64); + } + out = dst; + for (k = 0; k < out_size_w32; k ++) + sph_enc32be(out + (k << 2), sc->H[k]); +} + +#if SPH_64 + +static const sph_u64 salt_zero_big[4] = { 0, 0, 0, 0 }; + +static void +blake64_init(sph_blake_big_context *sc, + const sph_u64 *iv, const sph_u64 *salt) +{ + memcpy(sc->H, iv, 8 * sizeof(sph_u64)); + memcpy(sc->S, salt, 4 * sizeof(sph_u64)); + sc->T0 = sc->T1 = 0; + sc->ptr = 0; +} + +static void +blake64(sph_blake_big_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + DECL_STATE64 + + buf = sc->buf; + ptr = sc->ptr; + if (len < (sizeof sc->buf) - ptr) { + memcpy(buf + ptr, data, len); + ptr += len; + sc->ptr = ptr; + return; + } + + READ_STATE64(sc); + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data = (const unsigned char *)data + clen; + len -= clen; + if (ptr == sizeof sc->buf) { + if ((T0 = SPH_T64(T0 + 1024)) < 1024) + T1 = SPH_T64(T1 + 1); + COMPRESS64; + ptr = 0; + } + } + WRITE_STATE64(sc); + sc->ptr = ptr; +} + +static void +blake64_close(sph_blake_big_context *sc, + unsigned ub, unsigned n, void *dst, size_t out_size_w64) +{ + union { + unsigned char buf[128]; + sph_u64 dummy; + } u; + size_t ptr, k; + unsigned bit_len; + unsigned z; + sph_u64 th, tl; + unsigned char *out; + + ptr = sc->ptr; + bit_len = ((unsigned)ptr << 3) + n; + z = 0x80 >> n; + u.buf[ptr] = ((ub & -z) | z) & 0xFF; + tl = sc->T0 + bit_len; + th = sc->T1; + if (ptr == 0 && n == 0) { + sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00); + sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFF); + } else if (sc->T0 == 0) { + sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00) + bit_len; + sc->T1 = SPH_T64(sc->T1 - 1); + } else { + sc->T0 -= 1024 - bit_len; + } + if (bit_len <= 894) { + memset(u.buf + ptr + 1, 0, 111 - ptr); + if (out_size_w64 == 8) + u.buf[111] |= 1; + sph_enc64be_aligned(u.buf + 112, th); + sph_enc64be_aligned(u.buf + 120, tl); + blake64(sc, u.buf + ptr, 128 - ptr); + } else { + memset(u.buf + ptr + 1, 0, 127 - ptr); + blake64(sc, u.buf + ptr, 128 - ptr); + sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00); + sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFF); + memset(u.buf, 0, 112); + if (out_size_w64 == 8) + u.buf[111] = 1; + sph_enc64be_aligned(u.buf + 112, th); + sph_enc64be_aligned(u.buf + 120, tl); + blake64(sc, u.buf, 128); + } + out = dst; + for (k = 0; k < out_size_w64; k ++) + sph_enc64be(out + (k << 3), sc->H[k]); +} + +#endif + +/* see sph_blake.h */ +void +sph_blake224_init(void *cc) +{ + blake32_init(cc, IV224, salt_zero_small); +} + +/* see sph_blake.h */ +void +sph_blake224(void *cc, const void *data, size_t len) +{ + blake32(cc, data, len); +} + +/* see sph_blake.h */ +void +sph_blake224_close(void *cc, void *dst) +{ + sph_blake224_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_blake.h */ +void +sph_blake224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + blake32_close(cc, ub, n, dst, 7); + sph_blake224_init(cc); +} + +/* see sph_blake.h */ +void +sph_blake256_init(void *cc) +{ + blake32_init(cc, IV256, salt_zero_small); +} + +/* see sph_blake.h */ +void +sph_blake256(void *cc, const void *data, size_t len) +{ + blake32(cc, data, len); +} + +/* see sph_blake.h */ +void +sph_blake256_close(void *cc, void *dst) +{ + sph_blake256_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_blake.h */ +void +sph_blake256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + 
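/* The last argument is the digest size in 32-bit words: 8 gives the
   256-bit output, while the sph_blake224 wrapper above passes 7. */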
blake32_close(cc, ub, n, dst, 8); + sph_blake256_init(cc); +} + +#if SPH_64 + +/* see sph_blake.h */ +void +sph_blake384_init(void *cc) +{ + blake64_init(cc, IV384, salt_zero_big); +} + +/* see sph_blake.h */ +void +sph_blake384(void *cc, const void *data, size_t len) +{ + blake64(cc, data, len); +} + +/* see sph_blake.h */ +void +sph_blake384_close(void *cc, void *dst) +{ + sph_blake384_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_blake.h */ +void +sph_blake384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + blake64_close(cc, ub, n, dst, 6); + sph_blake384_init(cc); +} + +/* see sph_blake.h */ +void +sph_blake512_init(void *cc) +{ + blake64_init(cc, IV512, salt_zero_big); +} + +/* see sph_blake.h */ +void +sph_blake512(void *cc, const void *data, size_t len) +{ + blake64(cc, data, len); +} + +/* see sph_blake.h */ +void +sph_blake512_close(void *cc, void *dst) +{ + sph_blake512_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_blake.h */ +void +sph_blake512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + blake64_close(cc, ub, n, dst, 8); + sph_blake512_init(cc); +} + +#endif + +#ifdef __cplusplus +} +#endif diff --git a/ccminer.sln b/ccminer.sln new file mode 100644 index 0000000..a28278f --- /dev/null +++ b/ccminer.sln @@ -0,0 +1,26 @@ + +Microsoft Visual Studio Solution File, Format Version 11.00 +# Visual Studio 2010 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ccminer", "ccminer.vcxproj", "{36DC07F9-A4A6-4877-A146-1B960083CF6F}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Win32 = Debug|Win32 + Debug|x64 = Debug|x64 + Release|Win32 = Release|Win32 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {36DC07F9-A4A6-4877-A146-1B960083CF6F}.Debug|Win32.ActiveCfg = Debug|Win32 + {36DC07F9-A4A6-4877-A146-1B960083CF6F}.Debug|Win32.Build.0 = Debug|Win32 + {36DC07F9-A4A6-4877-A146-1B960083CF6F}.Debug|x64.ActiveCfg = Debug|x64 + {36DC07F9-A4A6-4877-A146-1B960083CF6F}.Debug|x64.Build.0 = Debug|x64 + {36DC07F9-A4A6-4877-A146-1B960083CF6F}.Release|Win32.ActiveCfg = Release|Win32 + {36DC07F9-A4A6-4877-A146-1B960083CF6F}.Release|Win32.Build.0 = Release|Win32 + {36DC07F9-A4A6-4877-A146-1B960083CF6F}.Release|x64.ActiveCfg = Release|x64 + {36DC07F9-A4A6-4877-A146-1B960083CF6F}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/ccminer.vcxproj b/ccminer.vcxproj new file mode 100644 index 0000000..b721b6c --- /dev/null +++ b/ccminer.vcxproj @@ -0,0 +1,291 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {36DC07F9-A4A6-4877-A146-1B960083CF6F} + ccminer + + + + Application + true + MultiByte + + + Application + true + MultiByte + + + Application + false + true + MultiByte + + + Application + false + true + MultiByte + + + + + + + + + + + + + + + + + + + + true + + + true + + + true + + + true + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;CURL_STATICLIB;SCRYPT_KECCAK512;SCRYPT_CHACHA;SCRYPT_CHOOSE_COMPILETIME;%(PreprocessorDefinitions) + .;compat;compat\jansson;compat\getopt;..\pthreads\Pre-built.2\include;..\curl-7.29.0\include;..\OpenSSL-Win32\include;%(AdditionalIncludeDirectories);$(CudaToolkitIncludeDir) + + + true + Console + 
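An aside on the sph_blake256_* functions defined in blake.c above: they form the usual sphlib streaming interface of init, any number of update calls, then close. The following is a minimal usage sketch, not taken from the patch; it assumes the sph_blake256_context typedef comes from sph_blake.h, which is not shown in this hunk.

#include <stdio.h>
#include <string.h>
#include "sph_blake.h"

int main(void)
{
    static const char msg[] = "hello world";
    unsigned char digest[32];           /* BLAKE-256 emits 32 bytes */
    sph_blake256_context ctx;           /* assumed typedef from sph_blake.h */
    int i;

    sph_blake256_init(&ctx);
    sph_blake256(&ctx, msg, strlen(msg));   /* update; may be called repeatedly */
    sph_blake256_close(&ctx, digest);       /* pad, finalize, write the digest */

    for (i = 0; i < 32; i++)
        printf("%02x", digest[i]);
    printf("\n");
    return 0;
}

Because sph_blake256_addbits_and_close() re-runs sph_blake256_init() after writing the digest, the same context can be reused for the next message without an explicit init.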
cudart.lib;cuda.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;pthreadVC2.lib;libcurl.lib;ws2_32.lib;Wldap32.lib;libeay32MTd.lib;ssleay32MTd.lib;%(AdditionalDependencies) + ..\pthreads\Pre-built.2\lib\x86;..\curl-7.29.0\build\lib\Debug;..\OpenSSL-Win32\lib\VC\static;%(AdditionalLibraryDirectories);$(CudaToolkitLibDir) + + + echo copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" +copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" + + + true + + + 128 + + + true + true + compute_35,sm_35 + + + -Xptxas "-abi=no -v" %(AdditionalOptions) + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;CURL_STATICLIB;SCRYPT_KECCAK512;SCRYPT_CHACHA;SCRYPT_CHOOSE_COMPILETIME;%(PreprocessorDefinitions) + .;compat;compat\jansson;compat\getopt;..\pthreads\Pre-built.2\include;..\curl-7.29.0_x64\include;..\OpenSSL-Win64\include;%(AdditionalIncludeDirectories);$(CudaToolkitIncludeDir) + + + true + Console + cudart.lib;cuda.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;pthreadVC2.lib;libcurl.lib;ws2_32.lib;Wldap32.lib;libeay32MTd.lib;ssleay32MTd.lib;%(AdditionalDependencies) + ..\pthreads\Pre-built.2\lib\x64;..\curl-7.29.0_x64\build\lib\Debug;..\OpenSSL-Win64\lib\VC\static;%(AdditionalLibraryDirectories);$(CudaToolkitLibDir) + + + echo copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" +copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" + + + true + + + 128 + + + true + true + compute_35,sm_35 + + + -Xptxas "-abi=no -v" %(AdditionalOptions) + + + + + Level3 + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;CURL_STATICLIB;SCRYPT_KECCAK512;SCRYPT_CHACHA;SCRYPT_CHOOSE_COMPILETIME;%(PreprocessorDefinitions) + .;compat;compat\jansson;compat\getopt;..\pthreads\Pre-built.2\include;..\curl-7.29.0\include;..\OpenSSL-Win32\include;%(AdditionalIncludeDirectories);$(CudaToolkitIncludeDir) + + + true + true + true + Console + cudart.lib;cuda.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;pthreadVC2.lib;libcurl.lib;ws2_32.lib;Wldap32.lib;libeay32MT.lib;ssleay32MT.lib;%(AdditionalDependencies) + ..\pthreads\Pre-built.2\lib\x86;..\curl-7.29.0\build\lib\Release;..\OpenSSL-Win32\lib\VC\static;%(AdditionalLibraryDirectories);$(CudaToolkitLibDir) + + + echo copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" +copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" + + + true + + + 128 + + + true + true + compute_35,sm_35 + + + -Xptxas "-abi=no -v" %(AdditionalOptions) + + + + + Level3 + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;CURL_STATICLIB;SCRYPT_KECCAK512;SCRYPT_CHACHA;SCRYPT_CHOOSE_COMPILETIME;%(PreprocessorDefinitions) + .;compat;compat\jansson;compat\getopt;..\pthreads\Pre-built.2\include;..\curl-7.29.0_x64\include;..\OpenSSL-Win64\include;%(AdditionalIncludeDirectories);$(CudaToolkitIncludeDir) + + + true + true + true + Console + cudart.lib;cuda.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;pthreadVC2.lib;libcurl.lib;ws2_32.lib;Wldap32.lib;libeay32MT.lib;ssleay32MT.lib;%(AdditionalDependencies) + ..\pthreads\Pre-built.2\lib\x64;..\curl-7.29.0_x64\build\lib\Release;..\OpenSSL-Win64\lib\VC\static;%(AdditionalLibraryDirectories);$(CudaToolkitLibDir) + + + echo copy 
"$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" +copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" + + + true + + + 128 + + + true + true + compute_35,sm_35 + + + -Xptxas "-abi=no -v" %(AdditionalOptions) + + + + + + + + + + + + + + /TP %(AdditionalOptions) + /TP %(AdditionalOptions) + /TP %(AdditionalOptions) + /TP %(AdditionalOptions) + + + + + + + + + /TP %(AdditionalOptions) + /TP %(AdditionalOptions) + /TP %(AdditionalOptions) + /TP %(AdditionalOptions) + + + /TP %(AdditionalOptions) + /TP %(AdditionalOptions) + /TP %(AdditionalOptions) + /TP %(AdditionalOptions) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + compute_10,sm_10 + compute_10,sm_10 + compute_10,sm_10 + compute_10,sm_10 + + + + + + + + + + \ No newline at end of file diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters new file mode 100644 index 0000000..f3b22ce --- /dev/null +++ b/ccminer.vcxproj.filters @@ -0,0 +1,188 @@ + + + + + {2450a9c7-a97a-49e1-ba19-c8dbc5a4e3e7} + + + {c53ce808-c5c5-4c6c-99a2-3947090c62f1} + + + {5a45c1bf-81d2-4bc6-97b5-714e34f51a82} + + + {431cec61-9376-4de9-aae9-04c4250652e7} + + + {cc8bb259-5332-4a45-ba81-f4840a55b604} + + + {89362bd8-4690-4f0c-a4f7-6b2fa67a1f34} + + + {6c3cd392-b6b8-424c-87d2-10e33dbd4b41} + + + {5a31b6f4-4943-4b22-b69a-230f3cc96269} + + + {a0f072d0-a831-4c23-8d64-7a026521df9c} + + + {fe39ded0-754b-415f-a284-038a15a0aa55} + + + {17b56151-79ec-4a32-bac3-9d94ae7f68fe} + + + + + Source Files\CUDA\jansson + + + Source Files\CUDA\jansson + + + Source Files\CUDA\jansson + + + Source Files\CUDA\jansson + + + Source Files\CUDA\jansson + + + Source Files\CUDA\jansson + + + Source Files\getopt + + + Source Files\gettimeofday + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + + + Header Files + + + Header Files + + + Header Files + + + Header Files\compat\sys + + + Header Files\compat + + + Header Files\compat + + + Header Files\compat\getopt + + + Header Files\compat + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files\CUDA + + + Header Files\CUDA + + + Header Files\CUDA + + + Header Files\CUDA + + + Header Files\CUDA + + + Header Files\CUDA + + + Header Files + + + Header Files + + + + + Source Files\CUDA + + + Source Files\CUDA + + + Source Files\CUDA + + + Source Files\CUDA + + + Source Files\CUDA + + + Source Files\CUDA + + + Source Files\CUDA + + + Source Files\CUDA + + + \ No newline at end of file diff --git a/compat.h b/compat.h new file mode 100644 index 0000000..ac7b8b9 --- /dev/null +++ b/compat.h @@ -0,0 +1,24 @@ +#ifndef __COMPAT_H__ +#define __COMPAT_H__ + +#ifdef WIN32 + +#include + +static __inline void sleep(int secs) +{ + Sleep(secs * 1000); +} + +enum { + PRIO_PROCESS = 0, +}; + +static __inline int setpriority(int which, int who, int prio) +{ + return -!SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_IDLE /*THREAD_PRIORITY_TIME_CRITICAL*/); +} + +#endif /* WIN32 */ + +#endif /* __COMPAT_H__ */ diff --git a/compat/Makefile.am b/compat/Makefile.am new file mode 100644 index 0000000..9401c8e --- /dev/null +++ b/compat/Makefile.am @@ -0,0 +1,7 @@ + +if WANT_JANSSON +SUBDIRS = jansson +else +SUBDIRS = +endif + diff --git a/compat/Makefile.in b/compat/Makefile.in new file mode 100644 index 0000000..d1d76d9 --- /dev/null +++ b/compat/Makefile.in @@ -0,0 +1,601 @@ +# Makefile.in generated by automake 1.13.3 from 
Makefile.am. +# @configure_input@ + +# Copyright (C) 1994-2013 Free Software Foundation, Inc. + +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + +@SET_MAKE@ +VPATH = @srcdir@ +am__is_gnu_make = test -n '$(MAKEFILE_LIST)' && test -n '$(MAKELEVEL)' +am__make_running_with_option = \ + case $${target_option-} in \ + ?) ;; \ + *) echo "am__make_running_with_option: internal error: invalid" \ + "target option '$${target_option-}' specified" >&2; \ + exit 1;; \ + esac; \ + has_opt=no; \ + sane_makeflags=$$MAKEFLAGS; \ + if $(am__is_gnu_make); then \ + sane_makeflags=$$MFLAGS; \ + else \ + case $$MAKEFLAGS in \ + *\\[\ \ ]*) \ + bs=\\; \ + sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \ + | sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \ + esac; \ + fi; \ + skip_next=no; \ + strip_trailopt () \ + { \ + flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \ + }; \ + for flg in $$sane_makeflags; do \ + test $$skip_next = yes && { skip_next=no; continue; }; \ + case $$flg in \ + *=*|--*) continue;; \ + -*I) strip_trailopt 'I'; skip_next=yes;; \ + -*I?*) strip_trailopt 'I';; \ + -*O) strip_trailopt 'O'; skip_next=yes;; \ + -*O?*) strip_trailopt 'O';; \ + -*l) strip_trailopt 'l'; skip_next=yes;; \ + -*l?*) strip_trailopt 'l';; \ + -[dEDm]) skip_next=yes;; \ + -[JT]) skip_next=yes;; \ + esac; \ + case $$flg in \ + *$$target_option*) has_opt=yes; break;; \ + esac; \ + done; \ + test $$has_opt = yes +am__make_dryrun = (target_option=n; $(am__make_running_with_option)) +am__make_keepgoing = (target_option=k; $(am__make_running_with_option)) +pkgdatadir = $(datadir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkglibexecdir = $(libexecdir)/@PACKAGE@ +am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +install_sh_DATA = $(install_sh) -c -m 644 +install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +build_triplet = @build@ +host_triplet = @host@ +target_triplet = @target@ +subdir = compat +DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/Makefile.am +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +am__aclocal_m4_deps = $(top_srcdir)/configure.ac +am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ + $(ACLOCAL_M4) +mkinstalldirs = $(install_sh) -d +CONFIG_HEADER = $(top_builddir)/cpuminer-config.h +CONFIG_CLEAN_FILES = +CONFIG_CLEAN_VPATH_FILES = +AM_V_P = $(am__v_P_@AM_V@) +am__v_P_ = $(am__v_P_@AM_DEFAULT_V@) +am__v_P_0 = false +am__v_P_1 = : +AM_V_GEN = $(am__v_GEN_@AM_V@) +am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@) +am__v_GEN_0 = @echo " GEN " $@; +am__v_GEN_1 = +AM_V_at = $(am__v_at_@AM_V@) +am__v_at_ = $(am__v_at_@AM_DEFAULT_V@) +am__v_at_0 = @ +am__v_at_1 = +SOURCES = +DIST_SOURCES = +RECURSIVE_TARGETS = all-recursive check-recursive cscopelist-recursive \ + ctags-recursive dvi-recursive html-recursive info-recursive \ + install-data-recursive install-dvi-recursive \ + install-exec-recursive install-html-recursive \ + install-info-recursive install-pdf-recursive \ + install-ps-recursive 
install-recursive installcheck-recursive \ + installdirs-recursive pdf-recursive ps-recursive \ + tags-recursive uninstall-recursive +am__can_run_installinfo = \ + case $$AM_UPDATE_INFO_DIR in \ + n|no|NO) false;; \ + *) (install-info --version) >/dev/null 2>&1;; \ + esac +RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive \ + distclean-recursive maintainer-clean-recursive +am__recursive_targets = \ + $(RECURSIVE_TARGETS) \ + $(RECURSIVE_CLEAN_TARGETS) \ + $(am__extra_recursive_targets) +AM_RECURSIVE_TARGETS = $(am__recursive_targets:-recursive=) TAGS CTAGS \ + distdir +am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP) +# Read a list of newline-separated strings from the standard input, +# and print each of them once, without duplicates. Input order is +# *not* preserved. +am__uniquify_input = $(AWK) '\ + BEGIN { nonempty = 0; } \ + { items[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in items) print i; }; } \ +' +# Make sure the list of sources is unique. This is necessary because, +# e.g., the same source file might be shared among _SOURCES variables +# for different programs/libraries. +am__define_uniq_tagged_files = \ + list='$(am__tagged_files)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | $(am__uniquify_input)` +ETAGS = etags +CTAGS = ctags +DIST_SUBDIRS = jansson +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) +am__relativize = \ + dir0=`pwd`; \ + sed_first='s,^\([^/]*\)/.*$$,\1,'; \ + sed_rest='s,^[^/]*/*,,'; \ + sed_last='s,^.*/\([^/]*\)$$,\1,'; \ + sed_butlast='s,/*[^/]*$$,,'; \ + while test -n "$$dir1"; do \ + first=`echo "$$dir1" | sed -e "$$sed_first"`; \ + if test "$$first" != "."; then \ + if test "$$first" = ".."; then \ + dir2=`echo "$$dir0" | sed -e "$$sed_last"`/"$$dir2"; \ + dir0=`echo "$$dir0" | sed -e "$$sed_butlast"`; \ + else \ + first2=`echo "$$dir2" | sed -e "$$sed_first"`; \ + if test "$$first2" = "$$first"; then \ + dir2=`echo "$$dir2" | sed -e "$$sed_rest"`; \ + else \ + dir2="../$$dir2"; \ + fi; \ + dir0="$$dir0"/"$$first"; \ + fi; \ + fi; \ + dir1=`echo "$$dir1" | sed -e "$$sed_rest"`; \ + done; \ + reldir="$$dir2" +ACLOCAL = @ACLOCAL@ +ALLOCA = @ALLOCA@ +AMTAR = @AMTAR@ +AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ +AUTOCONF = @AUTOCONF@ +AUTOHEADER = @AUTOHEADER@ +AUTOMAKE = @AUTOMAKE@ +AWK = @AWK@ +CC = @CC@ +CCAS = @CCAS@ +CCASDEPMODE = @CCASDEPMODE@ +CCASFLAGS = @CCASFLAGS@ +CCDEPMODE = @CCDEPMODE@ +CFLAGS = @CFLAGS@ +CPP = @CPP@ +CPPFLAGS = @CPPFLAGS@ +CUDA_CFLAGS = @CUDA_CFLAGS@ +CUDA_LDFLAGS = @CUDA_LDFLAGS@ +CUDA_LIBS = @CUDA_LIBS@ +CXX = @CXX@ +CXXDEPMODE = @CXXDEPMODE@ +CXXFLAGS = @CXXFLAGS@ +CYGPATH_W = @CYGPATH_W@ +DEFS = @DEFS@ +DEPDIR = @DEPDIR@ +ECHO_C = @ECHO_C@ +ECHO_N = @ECHO_N@ +ECHO_T = @ECHO_T@ +EGREP = @EGREP@ +EXEEXT = @EXEEXT@ +GREP = @GREP@ +INSTALL = @INSTALL@ +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ +JANSSON_LIBS = @JANSSON_LIBS@ +LDFLAGS = @LDFLAGS@ +LIBCURL = @LIBCURL@ +LIBCURL_CPPFLAGS = @LIBCURL_CPPFLAGS@ +LIBOBJS = @LIBOBJS@ +LIBS = @LIBS@ +LTLIBOBJS = @LTLIBOBJS@ +MAINT = @MAINT@ +MAKEINFO = @MAKEINFO@ +MKDIR_P = @MKDIR_P@ +NVCC = @NVCC@ +OBJEXT = @OBJEXT@ +OPENMP_CFLAGS = @OPENMP_CFLAGS@ +PACKAGE = @PACKAGE@ +PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ +PACKAGE_NAME = @PACKAGE_NAME@ +PACKAGE_STRING = @PACKAGE_STRING@ +PACKAGE_TARNAME = @PACKAGE_TARNAME@ +PACKAGE_URL = @PACKAGE_URL@ 
+PACKAGE_VERSION = @PACKAGE_VERSION@ +PATH_SEPARATOR = @PATH_SEPARATOR@ +PTHREAD_FLAGS = @PTHREAD_FLAGS@ +PTHREAD_LIBS = @PTHREAD_LIBS@ +RANLIB = @RANLIB@ +SET_MAKE = @SET_MAKE@ +SHELL = @SHELL@ +STRIP = @STRIP@ +VERSION = @VERSION@ +WS2_LIBS = @WS2_LIBS@ +_libcurl_config = @_libcurl_config@ +abs_builddir = @abs_builddir@ +abs_srcdir = @abs_srcdir@ +abs_top_builddir = @abs_top_builddir@ +abs_top_srcdir = @abs_top_srcdir@ +ac_ct_CC = @ac_ct_CC@ +ac_ct_CXX = @ac_ct_CXX@ +am__include = @am__include@ +am__leading_dot = @am__leading_dot@ +am__quote = @am__quote@ +am__tar = @am__tar@ +am__untar = @am__untar@ +bindir = @bindir@ +build = @build@ +build_alias = @build_alias@ +build_cpu = @build_cpu@ +build_os = @build_os@ +build_vendor = @build_vendor@ +builddir = @builddir@ +datadir = @datadir@ +datarootdir = @datarootdir@ +docdir = @docdir@ +dvidir = @dvidir@ +exec_prefix = @exec_prefix@ +host = @host@ +host_alias = @host_alias@ +host_cpu = @host_cpu@ +host_os = @host_os@ +host_vendor = @host_vendor@ +htmldir = @htmldir@ +includedir = @includedir@ +infodir = @infodir@ +install_sh = @install_sh@ +libdir = @libdir@ +libexecdir = @libexecdir@ +localedir = @localedir@ +localstatedir = @localstatedir@ +mandir = @mandir@ +mkdir_p = @mkdir_p@ +oldincludedir = @oldincludedir@ +pdfdir = @pdfdir@ +prefix = @prefix@ +program_transform_name = @program_transform_name@ +psdir = @psdir@ +sbindir = @sbindir@ +sharedstatedir = @sharedstatedir@ +srcdir = @srcdir@ +sysconfdir = @sysconfdir@ +target = @target@ +target_alias = @target_alias@ +target_cpu = @target_cpu@ +target_os = @target_os@ +target_vendor = @target_vendor@ +top_build_prefix = @top_build_prefix@ +top_builddir = @top_builddir@ +top_srcdir = @top_srcdir@ +@WANT_JANSSON_FALSE@SUBDIRS = +@WANT_JANSSON_TRUE@SUBDIRS = jansson +all: all-recursive + +.SUFFIXES: +$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps) + @for dep in $?; do \ + case '$(am__configure_deps)' in \ + *$$dep*) \ + ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ + && { if test -f $@; then exit 0; else break; fi; }; \ + exit 1;; \ + esac; \ + done; \ + echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu compat/Makefile'; \ + $(am__cd) $(top_srcdir) && \ + $(AUTOMAKE) --gnu compat/Makefile +.PRECIOUS: Makefile +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + @case '$?' in \ + *config.status*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ + *) \ + echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ + esac; + +$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh + +$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(am__aclocal_m4_deps): + +# This directory's subdirectories are mostly independent; you can cd +# into them and run 'make' without going through this Makefile. +# To change the values of 'make' variables: instead of editing Makefiles, +# (1) if the variable is set in 'config.status', edit 'config.status' +# (which will cause the Makefiles to be regenerated when you run 'make'); +# (2) otherwise, pass the desired values on the 'make' command line. 
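# The rule below is what makes plain "make", "make clean" and friends
# recurse into $(SUBDIRS); per the WANT_JANSSON conditional just above,
# that list is either empty or the single "jansson" subdirectory.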
+$(am__recursive_targets): + @fail=; \ + if $(am__make_keepgoing); then \ + failcom='fail=yes'; \ + else \ + failcom='exit 1'; \ + fi; \ + dot_seen=no; \ + target=`echo $@ | sed s/-recursive//`; \ + case "$@" in \ + distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \ + *) list='$(SUBDIRS)' ;; \ + esac; \ + for subdir in $$list; do \ + echo "Making $$target in $$subdir"; \ + if test "$$subdir" = "."; then \ + dot_seen=yes; \ + local_target="$$target-am"; \ + else \ + local_target="$$target"; \ + fi; \ + ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \ + || eval $$failcom; \ + done; \ + if test "$$dot_seen" = "no"; then \ + $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \ + fi; test -z "$$fail" + +ID: $(am__tagged_files) + $(am__define_uniq_tagged_files); mkid -fID $$unique +tags: tags-recursive +TAGS: tags + +tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + set x; \ + here=`pwd`; \ + if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \ + include_option=--etags-include; \ + empty_fix=.; \ + else \ + include_option=--include; \ + empty_fix=; \ + fi; \ + list='$(SUBDIRS)'; for subdir in $$list; do \ + if test "$$subdir" = .; then :; else \ + test ! -f $$subdir/TAGS || \ + set "$$@" "$$include_option=$$here/$$subdir/TAGS"; \ + fi; \ + done; \ + $(am__define_uniq_tagged_files); \ + shift; \ + if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ + test -n "$$unique" || unique=$$empty_fix; \ + if test $$# -gt 0; then \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + "$$@" $$unique; \ + else \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + $$unique; \ + fi; \ + fi +ctags: ctags-recursive + +CTAGS: ctags +ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + $(am__define_uniq_tagged_files); \ + test -z "$(CTAGS_ARGS)$$unique" \ + || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ + $$unique + +GTAGS: + here=`$(am__cd) $(top_builddir) && pwd` \ + && $(am__cd) $(top_srcdir) \ + && gtags -i $(GTAGS_ARGS) "$$here" +cscopelist: cscopelist-recursive + +cscopelist-am: $(am__tagged_files) + list='$(am__tagged_files)'; \ + case "$(srcdir)" in \ + [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \ + *) sdir=$(subdir)/$(srcdir) ;; \ + esac; \ + for i in $$list; do \ + if test -f "$$i"; then \ + echo "$(subdir)/$$i"; \ + else \ + echo "$$sdir/$$i"; \ + fi; \ + done >> $(top_builddir)/cscope.files + +distclean-tags: + -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags + +distdir: $(DISTFILES) + @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + list='$(DISTFILES)'; \ + dist_files=`for file in $$list; do echo $$file; done | \ + sed -e "s|^$$srcdirstrip/||;t" \ + -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ + case $$dist_files in \ + */*) $(MKDIR_P) `echo "$$dist_files" | \ + sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ + sort -u` ;; \ + esac; \ + for file in $$dist_files; do \ + if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ + if test -d $$d/$$file; then \ + dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ + if test -d "$(distdir)/$$file"; then \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ + cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ + find "$(distdir)/$$file" -type d ! 
-perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ + else \ + test -f "$(distdir)/$$file" \ + || cp -p $$d/$$file "$(distdir)/$$file" \ + || exit 1; \ + fi; \ + done + @list='$(DIST_SUBDIRS)'; for subdir in $$list; do \ + if test "$$subdir" = .; then :; else \ + $(am__make_dryrun) \ + || test -d "$(distdir)/$$subdir" \ + || $(MKDIR_P) "$(distdir)/$$subdir" \ + || exit 1; \ + dir1=$$subdir; dir2="$(distdir)/$$subdir"; \ + $(am__relativize); \ + new_distdir=$$reldir; \ + dir1=$$subdir; dir2="$(top_distdir)"; \ + $(am__relativize); \ + new_top_distdir=$$reldir; \ + echo " (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir="$$new_top_distdir" distdir="$$new_distdir" \\"; \ + echo " am__remove_distdir=: am__skip_length_check=: am__skip_mode_fix=: distdir)"; \ + ($(am__cd) $$subdir && \ + $(MAKE) $(AM_MAKEFLAGS) \ + top_distdir="$$new_top_distdir" \ + distdir="$$new_distdir" \ + am__remove_distdir=: \ + am__skip_length_check=: \ + am__skip_mode_fix=: \ + distdir) \ + || exit 1; \ + fi; \ + done +check-am: all-am +check: check-recursive +all-am: Makefile +installdirs: installdirs-recursive +installdirs-am: +install: install-recursive +install-exec: install-exec-recursive +install-data: install-data-recursive +uninstall: uninstall-recursive + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-recursive +install-strip: + if test -z '$(STRIP)'; then \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + install; \ + else \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ + fi +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) + -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) + +maintainer-clean-generic: + @echo "This command is intended for maintainers to use" + @echo "it deletes files that may require special tools to rebuild." 
+clean: clean-recursive + +clean-am: clean-generic mostlyclean-am + +distclean: distclean-recursive + -rm -f Makefile +distclean-am: clean-am distclean-generic distclean-tags + +dvi: dvi-recursive + +dvi-am: + +html: html-recursive + +html-am: + +info: info-recursive + +info-am: + +install-data-am: + +install-dvi: install-dvi-recursive + +install-dvi-am: + +install-exec-am: + +install-html: install-html-recursive + +install-html-am: + +install-info: install-info-recursive + +install-info-am: + +install-man: + +install-pdf: install-pdf-recursive + +install-pdf-am: + +install-ps: install-ps-recursive + +install-ps-am: + +installcheck-am: + +maintainer-clean: maintainer-clean-recursive + -rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: mostlyclean-recursive + +mostlyclean-am: mostlyclean-generic + +pdf: pdf-recursive + +pdf-am: + +ps: ps-recursive + +ps-am: + +uninstall-am: + +.MAKE: $(am__recursive_targets) install-am install-strip + +.PHONY: $(am__recursive_targets) CTAGS GTAGS TAGS all all-am check \ + check-am clean clean-generic cscopelist-am ctags ctags-am \ + distclean distclean-generic distclean-tags distdir dvi dvi-am \ + html html-am info info-am install install-am install-data \ + install-data-am install-dvi install-dvi-am install-exec \ + install-exec-am install-html install-html-am install-info \ + install-info-am install-man install-pdf install-pdf-am \ + install-ps install-ps-am install-strip installcheck \ + installcheck-am installdirs installdirs-am maintainer-clean \ + maintainer-clean-generic mostlyclean mostlyclean-generic pdf \ + pdf-am ps ps-am tags tags-am uninstall uninstall-am + + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/compat/getopt/getopt.h b/compat/getopt/getopt.h new file mode 100644 index 0000000..068cc24 --- /dev/null +++ b/compat/getopt/getopt.h @@ -0,0 +1,93 @@ +/* $Id: getopt.h,v 1.1 2009/10/16 19:50:28 rodney Exp rodney $ */ +/* $OpenBSD: getopt.h,v 1.1 2002/12/03 20:24:29 millert Exp $ */ +/* $NetBSD: getopt.h,v 1.4 2000/07/07 10:43:54 ad Exp $ */ + +/*- + * Copyright (c) 2000 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Dieter Baron and Thomas Klausner. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the NetBSD + * Foundation, Inc. and its contributors. + * 4. Neither the name of The NetBSD Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. 
AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _GETOPT_H_ +#define _GETOPT_H_ + +#if 0 +#include +#endif + +/* + * GNU-like getopt_long() and 4.4BSD getsubopt()/optreset extensions + */ +#define no_argument 0 +#define required_argument 1 +#define optional_argument 2 + +struct option { + /* name of long option */ + const char *name; + /* + * one of no_argument, required_argument, and optional_argument: + * whether option takes an argument + */ + int has_arg; + /* if not NULL, set *flag to val when option found */ + int *flag; + /* if flag not NULL, value to set *flag to; else return value */ + int val; +}; + +#ifdef __cplusplus +extern "C" { +#endif + +int getopt_long(int, char * const *, const char *, + const struct option *, int *); +int getopt_long_only(int, char * const *, const char *, + const struct option *, int *); +#ifndef _GETOPT_DEFINED +#define _GETOPT_DEFINED +int getopt(int, char * const *, const char *); +int getsubopt(char **, char * const *, char **); + +extern char *optarg; /* getopt(3) external variables */ +extern int opterr; +extern int optind; +extern int optopt; +extern int optreset; +extern char *suboptarg; /* getsubopt(3) external variable */ +#endif /* _GETOPT_DEFINED */ + +#ifdef __cplusplus +} +#endif +#endif /* !_GETOPT_H_ */ diff --git a/compat/getopt/getopt_long.c b/compat/getopt/getopt_long.c new file mode 100644 index 0000000..90fc0cf --- /dev/null +++ b/compat/getopt/getopt_long.c @@ -0,0 +1,554 @@ +/* $Id: getopt_long.c,v 1.1 2009/10/16 19:50:28 rodney Exp rodney $ */ +/* $OpenBSD: getopt_long.c,v 1.23 2007/10/31 12:34:57 chl Exp $ */ +/* $NetBSD: getopt_long.c,v 1.15 2002/01/31 22:43:40 tv Exp $ */ + +/* + * Copyright (c) 2002 Todd C. Miller + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * Sponsored in part by the Defense Advanced Research Projects + * Agency (DARPA) and Air Force Research Laboratory, Air Force + * Materiel Command, USAF, under agreement number F39502-99-1-0512. + */ + +#ifndef lint +static const char rcsid[]="$Id: getopt_long.c,v 1.1 2009/10/16 19:50:28 rodney Exp rodney $"; +#endif /* lint */ +/*- + * Copyright (c) 2000 The NetBSD Foundation, Inc. 
+ * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Dieter Baron and Thomas Klausner. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#if 0 +#include +#endif +#include +#include +#include +#include + +#ifdef _WIN32 + +/* Windows needs warnx(). We change the definition though: + * 1. (another) global is defined, opterrmsg, which holds the error message + * 2. errors are always printed out on stderr w/o the program name + * Note that opterrmsg always gets set no matter what opterr is set to. The + * error message will not be printed if opterr is 0 as usual. + */ + +#include +#include +#include + +char opterrmsg[128]; /* last error message is stored here */ + +static void warnx(const char *fmt, ...) +{ + va_list ap; + va_start(ap, fmt); + if (fmt != NULL) + _vsnprintf(opterrmsg, 128, fmt, ap); + else + opterrmsg[0]='\0'; + va_end(ap); + fprintf(stderr, opterrmsg); + fprintf(stderr, "\n"); +} + +#endif /*_WIN32*/ + +#define REPLACE_GETOPT /* use this getopt as the system getopt(3) */ + +#ifdef REPLACE_GETOPT +int opterr = 1; /* if error message should be printed */ +int optind = 1; /* index into parent argv vector */ +int optopt = '?'; /* character checked for validity */ +int optreset; /* reset getopt */ +char *optarg; /* argument associated with option */ +#endif + +#define PRINT_ERROR ((opterr) && (*options != ':')) + +#define FLAG_PERMUTE 0x01 /* permute non-options to the end of argv */ +#define FLAG_ALLARGS 0x02 /* treat non-options as args to option "-1" */ +#define FLAG_LONGONLY 0x04 /* operate as getopt_long_only */ + +/* return values */ +#define BADCH (int)'?' +#define BADARG ((*options == ':') ? 
(int)':' : (int)'?') +#define INORDER (int)1 + +#define EMSG "" + +static int getopt_internal(int, char * const *, const char *, + const struct option *, int *, int); +static int parse_long_options(char * const *, const char *, + const struct option *, int *, int); +static int gcd(int, int); +static void permute_args(int, int, int, char * const *); + +static char *place = EMSG; /* option letter processing */ + +/* XXX: set optreset to 1 rather than these two */ +static int nonopt_start = -1; /* first non option argument (for permute) */ +static int nonopt_end = -1; /* first option after non options (for permute) */ + +/* Error messages */ +static const char recargchar[] = "option requires an argument -- %c"; +static const char recargstring[] = "option requires an argument -- %s"; +static const char ambig[] = "ambiguous option -- %.*s"; +static const char noarg[] = "option doesn't take an argument -- %.*s"; +static const char illoptchar[] = "unknown option -- %c"; +static const char illoptstring[] = "unknown option -- %s"; + +/* + * Compute the greatest common divisor of a and b. + */ +static int +gcd(int a, int b) +{ + int c; + + c = a % b; + while (c != 0) { + a = b; + b = c; + c = a % b; + } + + return (b); +} + +/* + * Exchange the block from nonopt_start to nonopt_end with the block + * from nonopt_end to opt_end (keeping the same order of arguments + * in each block). + */ +static void +permute_args(int panonopt_start, int panonopt_end, int opt_end, + char * const *nargv) +{ + int cstart, cyclelen, i, j, ncycle, nnonopts, nopts, pos; + char *swap; + + /* + * compute lengths of blocks and number and size of cycles + */ + nnonopts = panonopt_end - panonopt_start; + nopts = opt_end - panonopt_end; + ncycle = gcd(nnonopts, nopts); + cyclelen = (opt_end - panonopt_start) / ncycle; + + for (i = 0; i < ncycle; i++) { + cstart = panonopt_end+i; + pos = cstart; + for (j = 0; j < cyclelen; j++) { + if (pos >= panonopt_end) + pos -= nnonopts; + else + pos += nopts; + swap = nargv[pos]; + /* LINTED const cast */ + ((char **) nargv)[pos] = nargv[cstart]; + /* LINTED const cast */ + ((char **)nargv)[cstart] = swap; + } + } +} + +/* + * parse_long_options -- + * Parse long options in argc/argv argument vector. + * Returns -1 if short_too is set and the option does not match long_options. + */ +static int +parse_long_options(char * const *nargv, const char *options, + const struct option *long_options, int *idx, int short_too) +{ + char *current_argv, *has_equal; + size_t current_argv_len; + int i, match; + + current_argv = place; + match = -1; + + optind++; + + if ((has_equal = strchr(current_argv, '=')) != NULL) { + /* argument found (--option=arg) */ + current_argv_len = has_equal - current_argv; + has_equal++; + } else + current_argv_len = strlen(current_argv); + + for (i = 0; long_options[i].name; i++) { + /* find matching long option */ + if (strncmp(current_argv, long_options[i].name, + current_argv_len)) + continue; + + if (strlen(long_options[i].name) == current_argv_len) { + /* exact match */ + match = i; + break; + } + /* + * If this is a known short option, don't allow + * a partial match of a single character. 
+ */ + if (short_too && current_argv_len == 1) + continue; + + if (match == -1) /* partial match */ + match = i; + else { + /* ambiguous abbreviation */ + if (PRINT_ERROR) + warnx(ambig, (int)current_argv_len, + current_argv); + optopt = 0; + return (BADCH); + } + } + if (match != -1) { /* option found */ + if (long_options[match].has_arg == no_argument + && has_equal) { + if (PRINT_ERROR) + warnx(noarg, (int)current_argv_len, + current_argv); + /* + * XXX: GNU sets optopt to val regardless of flag + */ + if (long_options[match].flag == NULL) + optopt = long_options[match].val; + else + optopt = 0; + return (BADARG); + } + if (long_options[match].has_arg == required_argument || + long_options[match].has_arg == optional_argument) { + if (has_equal) + optarg = has_equal; + else if (long_options[match].has_arg == + required_argument) { + /* + * optional argument doesn't use next nargv + */ + optarg = nargv[optind++]; + } + } + if ((long_options[match].has_arg == required_argument) + && (optarg == NULL)) { + /* + * Missing argument; leading ':' indicates no error + * should be generated. + */ + if (PRINT_ERROR) + warnx(recargstring, + current_argv); + /* + * XXX: GNU sets optopt to val regardless of flag + */ + if (long_options[match].flag == NULL) + optopt = long_options[match].val; + else + optopt = 0; + --optind; + return (BADARG); + } + } else { /* unknown option */ + if (short_too) { + --optind; + return (-1); + } + if (PRINT_ERROR) + warnx(illoptstring, current_argv); + optopt = 0; + return (BADCH); + } + if (idx) + *idx = match; + if (long_options[match].flag) { + *long_options[match].flag = long_options[match].val; + return (0); + } else + return (long_options[match].val); +} + +/* + * getopt_internal -- + * Parse argc/argv argument vector. Called by user level routines. + */ +static int +getopt_internal(int nargc, char * const *nargv, const char *options, + const struct option *long_options, int *idx, int flags) +{ + char *oli; /* option letter list index */ + int optchar, short_too; + static int posixly_correct = -1; + + if (options == NULL) + return (-1); + + /* + * Disable GNU extensions if POSIXLY_CORRECT is set or options + * string begins with a '+'. + */ + if (posixly_correct == -1) + posixly_correct = (getenv("POSIXLY_CORRECT") != NULL); + if (posixly_correct || *options == '+') + flags &= ~FLAG_PERMUTE; + else if (*options == '-') + flags |= FLAG_ALLARGS; + if (*options == '+' || *options == '-') + options++; + + /* + * XXX Some GNU programs (like cvs) set optind to 0 instead of + * XXX using optreset. Work around this braindamage. + */ + if (optind == 0) + optind = optreset = 1; + + optarg = NULL; + if (optreset) + nonopt_start = nonopt_end = -1; +start: + if (optreset || !*place) { /* update scanning pointer */ + optreset = 0; + if (optind >= nargc) { /* end of argument vector */ + place = EMSG; + if (nonopt_end != -1) { + /* do permutation, if we have to */ + permute_args(nonopt_start, nonopt_end, + optind, nargv); + optind -= nonopt_end - nonopt_start; + } + else if (nonopt_start != -1) { + /* + * If we skipped non-options, set optind + * to the first of them. 
+ */ + optind = nonopt_start; + } + nonopt_start = nonopt_end = -1; + return (-1); + } + if (*(place = nargv[optind]) != '-' || + (place[1] == '\0' && strchr(options, '-') == NULL)) { + place = EMSG; /* found non-option */ + if (flags & FLAG_ALLARGS) { + /* + * GNU extension: + * return non-option as argument to option 1 + */ + optarg = nargv[optind++]; + return (INORDER); + } + if (!(flags & FLAG_PERMUTE)) { + /* + * If no permutation wanted, stop parsing + * at first non-option. + */ + return (-1); + } + /* do permutation */ + if (nonopt_start == -1) + nonopt_start = optind; + else if (nonopt_end != -1) { + permute_args(nonopt_start, nonopt_end, + optind, nargv); + nonopt_start = optind - + (nonopt_end - nonopt_start); + nonopt_end = -1; + } + optind++; + /* process next argument */ + goto start; + } + if (nonopt_start != -1 && nonopt_end == -1) + nonopt_end = optind; + + /* + * If we have "-" do nothing, if "--" we are done. + */ + if (place[1] != '\0' && *++place == '-' && place[1] == '\0') { + optind++; + place = EMSG; + /* + * We found an option (--), so if we skipped + * non-options, we have to permute. + */ + if (nonopt_end != -1) { + permute_args(nonopt_start, nonopt_end, + optind, nargv); + optind -= nonopt_end - nonopt_start; + } + nonopt_start = nonopt_end = -1; + return (-1); + } + } + + /* + * Check long options if: + * 1) we were passed some + * 2) the arg is not just "-" + * 3) either the arg starts with -- we are getopt_long_only() + */ + if (long_options != NULL && place != nargv[optind] && + (*place == '-' || (flags & FLAG_LONGONLY))) { + short_too = 0; + if (*place == '-') + place++; /* --foo long option */ + else if (*place != ':' && strchr(options, *place) != NULL) + short_too = 1; /* could be short option too */ + + optchar = parse_long_options(nargv, options, long_options, + idx, short_too); + if (optchar != -1) { + place = EMSG; + return (optchar); + } + } + + if ((optchar = (int)*place++) == (int)':' || + (optchar == (int)'-' && *place != '\0') || + (oli = strchr(options, optchar)) == NULL) { + /* + * If the user specified "-" and '-' isn't listed in + * options, return -1 (non-option) as per POSIX. + * Otherwise, it is an unknown option character (or ':'). + */ + if (optchar == (int)'-' && *place == '\0') + return (-1); + if (!*place) + ++optind; + if (PRINT_ERROR) + warnx(illoptchar, optchar); + optopt = optchar; + return (BADCH); + } + if (long_options != NULL && optchar == 'W' && oli[1] == ';') { + /* -W long-option */ + if (*place) /* no space */ + /* NOTHING */; + else if (++optind >= nargc) { /* no arg */ + place = EMSG; + if (PRINT_ERROR) + warnx(recargchar, optchar); + optopt = optchar; + return (BADARG); + } else /* white space */ + place = nargv[optind]; + optchar = parse_long_options(nargv, options, long_options, + idx, 0); + place = EMSG; + return (optchar); + } + if (*++oli != ':') { /* doesn't take argument */ + if (!*place) + ++optind; + } else { /* takes (optional) argument */ + optarg = NULL; + if (*place) /* no white space */ + optarg = place; + else if (oli[1] != ':') { /* arg not optional */ + if (++optind >= nargc) { /* no arg */ + place = EMSG; + if (PRINT_ERROR) + warnx(recargchar, optchar); + optopt = optchar; + return (BADARG); + } else + optarg = nargv[optind]; + } + place = EMSG; + ++optind; + } + /* dump back option letter */ + return (optchar); +} + +#ifdef REPLACE_GETOPT +/* + * getopt -- + * Parse argc/argv argument vector. 
+ * + * [eventually this will replace the BSD getopt] + */ +int +getopt(int nargc, char * const *nargv, const char *options) +{ + + /* + * We don't pass FLAG_PERMUTE to getopt_internal() since + * the BSD getopt(3) (unlike GNU) has never done this. + * + * Furthermore, since many privileged programs call getopt() + * before dropping privileges it makes sense to keep things + * as simple (and bug-free) as possible. + */ + return (getopt_internal(nargc, nargv, options, NULL, NULL, 0)); +} +#endif /* REPLACE_GETOPT */ + +/* + * getopt_long -- + * Parse argc/argv argument vector. + */ +int +getopt_long(int nargc, char * const *nargv, const char *options, + const struct option *long_options, int *idx) +{ + + return (getopt_internal(nargc, nargv, options, long_options, idx, + FLAG_PERMUTE)); +} + +/* + * getopt_long_only -- + * Parse argc/argv argument vector. + */ +int +getopt_long_only(int nargc, char * const *nargv, const char *options, + const struct option *long_options, int *idx) +{ + + return (getopt_internal(nargc, nargv, options, long_options, idx, + FLAG_PERMUTE|FLAG_LONGONLY)); +} diff --git a/compat/gettimeofday.c b/compat/gettimeofday.c new file mode 100644 index 0000000..da17893 --- /dev/null +++ b/compat/gettimeofday.c @@ -0,0 +1,83 @@ +#include < time.h > +#include //I've ommited this line. +#if defined(_MSC_VER) || defined(_MSC_EXTENSIONS) + #define DELTA_EPOCH_IN_MICROSECS 11644473600000000Ui64 +#else + #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +struct timezone +{ + int tz_minuteswest; /* minutes W of Greenwich */ + int tz_dsttime; /* type of dst correction */ +}; + +int gettimeofday(struct timeval *tv, struct timezone *tz) +{ + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + if (NULL != tz) + { + if (!tzflag) + { + _tzset(); + tzflag++; + } + tz->tz_minuteswest = _timezone / 60; + tz->tz_dsttime = _daylight; + } + + return 0; +} + +void usleep(__int64 waitTime) +{ + if (waitTime > 0) + { + if (waitTime > 100) + { + // use a waitable timer for larger intervals > 0.1ms + + HANDLE timer; + LARGE_INTEGER ft; + + ft.QuadPart = -(10*waitTime); // Convert to 100 nanosecond interval, negative value indicates relative time + + timer = CreateWaitableTimer(NULL, TRUE, NULL); + SetWaitableTimer(timer, &ft, 0, NULL, NULL, 0); + WaitForSingleObject(timer, INFINITE); + CloseHandle(timer); + } + else + { + // use a polling loop for short intervals <= 100ms + + LARGE_INTEGER perfCnt, start, now; + __int64 elapsed; + + QueryPerformanceFrequency(&perfCnt); + QueryPerformanceCounter(&start); + do { + QueryPerformanceCounter((LARGE_INTEGER*) &now); + elapsed = (__int64)((now.QuadPart - start.QuadPart) / (float)perfCnt.QuadPart * 1000 * 1000); + } while ( elapsed < waitTime ); + } + } +} diff --git a/compat/inttypes.h b/compat/inttypes.h new file mode 100644 index 0000000..f07d50f --- /dev/null +++ b/compat/inttypes.h @@ -0,0 +1,2 @@ +#pragma once +#include diff --git a/compat/jansson/Makefile.am b/compat/jansson/Makefile.am new file mode 100644 index 0000000..ff38e51 --- /dev/null +++ b/compat/jansson/Makefile.am @@ -0,0 +1,18 @@ + +noinst_LIBRARIES = libjansson.a + +libjansson_a_SOURCES = \ + config.h \ + 
dump.c \ + hashtable.c \ + hashtable.h \ + jansson.h \ + jansson_private.h \ + load.c \ + strbuffer.c \ + strbuffer.h \ + utf.c \ + utf.h \ + util.h \ + value.c + diff --git a/compat/jansson/Makefile.in b/compat/jansson/Makefile.in new file mode 100644 index 0000000..f5e30ac --- /dev/null +++ b/compat/jansson/Makefile.in @@ -0,0 +1,571 @@ +# Makefile.in generated by automake 1.13.3 from Makefile.am. +# @configure_input@ + +# Copyright (C) 1994-2013 Free Software Foundation, Inc. + +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + +@SET_MAKE@ + +VPATH = @srcdir@ +am__is_gnu_make = test -n '$(MAKEFILE_LIST)' && test -n '$(MAKELEVEL)' +am__make_running_with_option = \ + case $${target_option-} in \ + ?) ;; \ + *) echo "am__make_running_with_option: internal error: invalid" \ + "target option '$${target_option-}' specified" >&2; \ + exit 1;; \ + esac; \ + has_opt=no; \ + sane_makeflags=$$MAKEFLAGS; \ + if $(am__is_gnu_make); then \ + sane_makeflags=$$MFLAGS; \ + else \ + case $$MAKEFLAGS in \ + *\\[\ \ ]*) \ + bs=\\; \ + sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \ + | sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \ + esac; \ + fi; \ + skip_next=no; \ + strip_trailopt () \ + { \ + flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \ + }; \ + for flg in $$sane_makeflags; do \ + test $$skip_next = yes && { skip_next=no; continue; }; \ + case $$flg in \ + *=*|--*) continue;; \ + -*I) strip_trailopt 'I'; skip_next=yes;; \ + -*I?*) strip_trailopt 'I';; \ + -*O) strip_trailopt 'O'; skip_next=yes;; \ + -*O?*) strip_trailopt 'O';; \ + -*l) strip_trailopt 'l'; skip_next=yes;; \ + -*l?*) strip_trailopt 'l';; \ + -[dEDm]) skip_next=yes;; \ + -[JT]) skip_next=yes;; \ + esac; \ + case $$flg in \ + *$$target_option*) has_opt=yes; break;; \ + esac; \ + done; \ + test $$has_opt = yes +am__make_dryrun = (target_option=n; $(am__make_running_with_option)) +am__make_keepgoing = (target_option=k; $(am__make_running_with_option)) +pkgdatadir = $(datadir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkglibexecdir = $(libexecdir)/@PACKAGE@ +am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +install_sh_DATA = $(install_sh) -c -m 644 +install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +build_triplet = @build@ +host_triplet = @host@ +target_triplet = @target@ +subdir = compat/jansson +DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/Makefile.am \ + $(top_srcdir)/depcomp +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +am__aclocal_m4_deps = $(top_srcdir)/configure.ac +am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ + $(ACLOCAL_M4) +mkinstalldirs = $(install_sh) -d +CONFIG_HEADER = $(top_builddir)/cpuminer-config.h +CONFIG_CLEAN_FILES = +CONFIG_CLEAN_VPATH_FILES = +LIBRARIES = $(noinst_LIBRARIES) +AR = ar +ARFLAGS = cru +AM_V_AR = $(am__v_AR_@AM_V@) +am__v_AR_ = $(am__v_AR_@AM_DEFAULT_V@) +am__v_AR_0 = @echo " AR " $@; +am__v_AR_1 = +libjansson_a_AR = $(AR) $(ARFLAGS) +libjansson_a_LIBADD = 
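+# libjansson.a is a "noinst" convenience archive: it is built here but never
+# installed on its own; the top-level build is expected to link it into the
+# miner binary.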
+am_libjansson_a_OBJECTS = dump.$(OBJEXT) hashtable.$(OBJEXT) \ + load.$(OBJEXT) strbuffer.$(OBJEXT) utf.$(OBJEXT) \ + value.$(OBJEXT) +libjansson_a_OBJECTS = $(am_libjansson_a_OBJECTS) +AM_V_P = $(am__v_P_@AM_V@) +am__v_P_ = $(am__v_P_@AM_DEFAULT_V@) +am__v_P_0 = false +am__v_P_1 = : +AM_V_GEN = $(am__v_GEN_@AM_V@) +am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@) +am__v_GEN_0 = @echo " GEN " $@; +am__v_GEN_1 = +AM_V_at = $(am__v_at_@AM_V@) +am__v_at_ = $(am__v_at_@AM_DEFAULT_V@) +am__v_at_0 = @ +am__v_at_1 = +DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir) +depcomp = $(SHELL) $(top_srcdir)/depcomp +am__depfiles_maybe = depfiles +am__mv = mv -f +COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +AM_V_CC = $(am__v_CC_@AM_V@) +am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@) +am__v_CC_0 = @echo " CC " $@; +am__v_CC_1 = +CCLD = $(CC) +LINK = $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +AM_V_CCLD = $(am__v_CCLD_@AM_V@) +am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@) +am__v_CCLD_0 = @echo " CCLD " $@; +am__v_CCLD_1 = +SOURCES = $(libjansson_a_SOURCES) +DIST_SOURCES = $(libjansson_a_SOURCES) +am__can_run_installinfo = \ + case $$AM_UPDATE_INFO_DIR in \ + n|no|NO) false;; \ + *) (install-info --version) >/dev/null 2>&1;; \ + esac +am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP) +# Read a list of newline-separated strings from the standard input, +# and print each of them once, without duplicates. Input order is +# *not* preserved. +am__uniquify_input = $(AWK) '\ + BEGIN { nonempty = 0; } \ + { items[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in items) print i; }; } \ +' +# Make sure the list of sources is unique. This is necessary because, +# e.g., the same source file might be shared among _SOURCES variables +# for different programs/libraries. 
+am__define_uniq_tagged_files = \ + list='$(am__tagged_files)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | $(am__uniquify_input)` +ETAGS = etags +CTAGS = ctags +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) +ACLOCAL = @ACLOCAL@ +ALLOCA = @ALLOCA@ +AMTAR = @AMTAR@ +AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ +AUTOCONF = @AUTOCONF@ +AUTOHEADER = @AUTOHEADER@ +AUTOMAKE = @AUTOMAKE@ +AWK = @AWK@ +CC = @CC@ +CCAS = @CCAS@ +CCASDEPMODE = @CCASDEPMODE@ +CCASFLAGS = @CCASFLAGS@ +CCDEPMODE = @CCDEPMODE@ +CFLAGS = @CFLAGS@ +CPP = @CPP@ +CPPFLAGS = @CPPFLAGS@ +CUDA_CFLAGS = @CUDA_CFLAGS@ +CUDA_LDFLAGS = @CUDA_LDFLAGS@ +CUDA_LIBS = @CUDA_LIBS@ +CXX = @CXX@ +CXXDEPMODE = @CXXDEPMODE@ +CXXFLAGS = @CXXFLAGS@ +CYGPATH_W = @CYGPATH_W@ +DEFS = @DEFS@ +DEPDIR = @DEPDIR@ +ECHO_C = @ECHO_C@ +ECHO_N = @ECHO_N@ +ECHO_T = @ECHO_T@ +EGREP = @EGREP@ +EXEEXT = @EXEEXT@ +GREP = @GREP@ +INSTALL = @INSTALL@ +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ +JANSSON_LIBS = @JANSSON_LIBS@ +LDFLAGS = @LDFLAGS@ +LIBCURL = @LIBCURL@ +LIBCURL_CPPFLAGS = @LIBCURL_CPPFLAGS@ +LIBOBJS = @LIBOBJS@ +LIBS = @LIBS@ +LTLIBOBJS = @LTLIBOBJS@ +MAINT = @MAINT@ +MAKEINFO = @MAKEINFO@ +MKDIR_P = @MKDIR_P@ +NVCC = @NVCC@ +OBJEXT = @OBJEXT@ +OPENMP_CFLAGS = @OPENMP_CFLAGS@ +PACKAGE = @PACKAGE@ +PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ +PACKAGE_NAME = @PACKAGE_NAME@ +PACKAGE_STRING = @PACKAGE_STRING@ +PACKAGE_TARNAME = @PACKAGE_TARNAME@ +PACKAGE_URL = @PACKAGE_URL@ +PACKAGE_VERSION = @PACKAGE_VERSION@ +PATH_SEPARATOR = @PATH_SEPARATOR@ +PTHREAD_FLAGS = @PTHREAD_FLAGS@ +PTHREAD_LIBS = @PTHREAD_LIBS@ +RANLIB = @RANLIB@ +SET_MAKE = @SET_MAKE@ +SHELL = @SHELL@ +STRIP = @STRIP@ +VERSION = @VERSION@ +WS2_LIBS = @WS2_LIBS@ +_libcurl_config = @_libcurl_config@ +abs_builddir = @abs_builddir@ +abs_srcdir = @abs_srcdir@ +abs_top_builddir = @abs_top_builddir@ +abs_top_srcdir = @abs_top_srcdir@ +ac_ct_CC = @ac_ct_CC@ +ac_ct_CXX = @ac_ct_CXX@ +am__include = @am__include@ +am__leading_dot = @am__leading_dot@ +am__quote = @am__quote@ +am__tar = @am__tar@ +am__untar = @am__untar@ +bindir = @bindir@ +build = @build@ +build_alias = @build_alias@ +build_cpu = @build_cpu@ +build_os = @build_os@ +build_vendor = @build_vendor@ +builddir = @builddir@ +datadir = @datadir@ +datarootdir = @datarootdir@ +docdir = @docdir@ +dvidir = @dvidir@ +exec_prefix = @exec_prefix@ +host = @host@ +host_alias = @host_alias@ +host_cpu = @host_cpu@ +host_os = @host_os@ +host_vendor = @host_vendor@ +htmldir = @htmldir@ +includedir = @includedir@ +infodir = @infodir@ +install_sh = @install_sh@ +libdir = @libdir@ +libexecdir = @libexecdir@ +localedir = @localedir@ +localstatedir = @localstatedir@ +mandir = @mandir@ +mkdir_p = @mkdir_p@ +oldincludedir = @oldincludedir@ +pdfdir = @pdfdir@ +prefix = @prefix@ +program_transform_name = @program_transform_name@ +psdir = @psdir@ +sbindir = @sbindir@ +sharedstatedir = @sharedstatedir@ +srcdir = @srcdir@ +sysconfdir = @sysconfdir@ +target = @target@ +target_alias = @target_alias@ +target_cpu = @target_cpu@ +target_os = @target_os@ +target_vendor = @target_vendor@ +top_build_prefix = @top_build_prefix@ +top_builddir = @top_builddir@ +top_srcdir = @top_srcdir@ +noinst_LIBRARIES = libjansson.a +libjansson_a_SOURCES = \ + config.h \ + dump.c \ + hashtable.c \ + hashtable.h \ + jansson.h \ + jansson_private.h \ + load.c \ + strbuffer.c \ + strbuffer.h \ + utf.c \ 
+ utf.h \ + util.h \ + value.c + +all: all-am + +.SUFFIXES: +.SUFFIXES: .c .o .obj +$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps) + @for dep in $?; do \ + case '$(am__configure_deps)' in \ + *$$dep*) \ + ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ + && { if test -f $@; then exit 0; else break; fi; }; \ + exit 1;; \ + esac; \ + done; \ + echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu compat/jansson/Makefile'; \ + $(am__cd) $(top_srcdir) && \ + $(AUTOMAKE) --gnu compat/jansson/Makefile +.PRECIOUS: Makefile +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + @case '$?' in \ + *config.status*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ + *) \ + echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ + esac; + +$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh + +$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(am__aclocal_m4_deps): + +clean-noinstLIBRARIES: + -test -z "$(noinst_LIBRARIES)" || rm -f $(noinst_LIBRARIES) + +libjansson.a: $(libjansson_a_OBJECTS) $(libjansson_a_DEPENDENCIES) $(EXTRA_libjansson_a_DEPENDENCIES) + $(AM_V_at)-rm -f libjansson.a + $(AM_V_AR)$(libjansson_a_AR) libjansson.a $(libjansson_a_OBJECTS) $(libjansson_a_LIBADD) + $(AM_V_at)$(RANLIB) libjansson.a + +mostlyclean-compile: + -rm -f *.$(OBJEXT) + +distclean-compile: + -rm -f *.tab.c + +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dump.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hashtable.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/load.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/strbuffer.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/utf.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/value.Po@am__quote@ + +.c.o: +@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c $< + +.c.obj: +@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c `$(CYGPATH_W) '$<'` + +ID: $(am__tagged_files) + $(am__define_uniq_tagged_files); mkid -fID $$unique +tags: tags-am +TAGS: tags + +tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + set x; \ + here=`pwd`; \ + $(am__define_uniq_tagged_files); \ + shift; \ + if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ + test -n "$$unique" || unique=$$empty_fix; \ + if test $$# -gt 0; then \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + "$$@" $$unique; 
\ + else \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + $$unique; \ + fi; \ + fi +ctags: ctags-am + +CTAGS: ctags +ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + $(am__define_uniq_tagged_files); \ + test -z "$(CTAGS_ARGS)$$unique" \ + || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ + $$unique + +GTAGS: + here=`$(am__cd) $(top_builddir) && pwd` \ + && $(am__cd) $(top_srcdir) \ + && gtags -i $(GTAGS_ARGS) "$$here" +cscopelist: cscopelist-am + +cscopelist-am: $(am__tagged_files) + list='$(am__tagged_files)'; \ + case "$(srcdir)" in \ + [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \ + *) sdir=$(subdir)/$(srcdir) ;; \ + esac; \ + for i in $$list; do \ + if test -f "$$i"; then \ + echo "$(subdir)/$$i"; \ + else \ + echo "$$sdir/$$i"; \ + fi; \ + done >> $(top_builddir)/cscope.files + +distclean-tags: + -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags + +distdir: $(DISTFILES) + @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + list='$(DISTFILES)'; \ + dist_files=`for file in $$list; do echo $$file; done | \ + sed -e "s|^$$srcdirstrip/||;t" \ + -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ + case $$dist_files in \ + */*) $(MKDIR_P) `echo "$$dist_files" | \ + sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ + sort -u` ;; \ + esac; \ + for file in $$dist_files; do \ + if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ + if test -d $$d/$$file; then \ + dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ + if test -d "$(distdir)/$$file"; then \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ + cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ + else \ + test -f "$(distdir)/$$file" \ + || cp -p $$d/$$file "$(distdir)/$$file" \ + || exit 1; \ + fi; \ + done +check-am: all-am +check: check-am +all-am: Makefile $(LIBRARIES) +installdirs: +install: install-am +install-exec: install-exec-am +install-data: install-data-am +uninstall: uninstall-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-am +install-strip: + if test -z '$(STRIP)'; then \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + install; \ + else \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ + fi +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) + -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) + +maintainer-clean-generic: + @echo "This command is intended for maintainers to use" + @echo "it deletes files that may require special tools to rebuild." 
+clean: clean-am + +clean-am: clean-generic clean-noinstLIBRARIES mostlyclean-am + +distclean: distclean-am + -rm -rf ./$(DEPDIR) + -rm -f Makefile +distclean-am: clean-am distclean-compile distclean-generic \ + distclean-tags + +dvi: dvi-am + +dvi-am: + +html: html-am + +html-am: + +info: info-am + +info-am: + +install-data-am: + +install-dvi: install-dvi-am + +install-dvi-am: + +install-exec-am: + +install-html: install-html-am + +install-html-am: + +install-info: install-info-am + +install-info-am: + +install-man: + +install-pdf: install-pdf-am + +install-pdf-am: + +install-ps: install-ps-am + +install-ps-am: + +installcheck-am: + +maintainer-clean: maintainer-clean-am + -rm -rf ./$(DEPDIR) + -rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: mostlyclean-am + +mostlyclean-am: mostlyclean-compile mostlyclean-generic + +pdf: pdf-am + +pdf-am: + +ps: ps-am + +ps-am: + +uninstall-am: + +.MAKE: install-am install-strip + +.PHONY: CTAGS GTAGS TAGS all all-am check check-am clean clean-generic \ + clean-noinstLIBRARIES cscopelist-am ctags ctags-am distclean \ + distclean-compile distclean-generic distclean-tags distdir dvi \ + dvi-am html html-am info info-am install install-am \ + install-data install-data-am install-dvi install-dvi-am \ + install-exec install-exec-am install-html install-html-am \ + install-info install-info-am install-man install-pdf \ + install-pdf-am install-ps install-ps-am install-strip \ + installcheck installcheck-am installdirs maintainer-clean \ + maintainer-clean-generic mostlyclean mostlyclean-compile \ + mostlyclean-generic pdf pdf-am ps ps-am tags tags-am uninstall \ + uninstall-am + + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/compat/jansson/config.h b/compat/jansson/config.h new file mode 100644 index 0000000..f11075a --- /dev/null +++ b/compat/jansson/config.h @@ -0,0 +1,73 @@ +/* config.h. Generated from config.h.in by configure. */ +/* config.h.in. Generated from configure.ac by autoheader. */ + +/* Define to 1 if you have the header file. */ +#define HAVE_DLFCN_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_INTTYPES_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_MEMORY_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDINT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDLIB_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRINGS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRING_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_STAT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TYPES_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_UNISTD_H 1 + +/* Define to the sub-directory in which libtool stores uninstalled libraries. + */ +#define LT_OBJDIR ".libs/" + +/* Name of package */ +#define PACKAGE "jansson" + +/* Define to the address where bug reports for this package should be sent. */ +#define PACKAGE_BUGREPORT "petri@digip.org" + +/* Define to the full name of this package. */ +#define PACKAGE_NAME "jansson" + +/* Define to the full name and version of this package. */ +#define PACKAGE_STRING "jansson 1.3" + +/* Define to the one symbol short name of this package. */ +#define PACKAGE_TARNAME "jansson" + +/* Define to the home page for this package. 
*/ +#define PACKAGE_URL "" + +/* Define to the version of this package. */ +#define PACKAGE_VERSION "1.3" + +/* Define to 1 if you have the ANSI C header files. */ +#define STDC_HEADERS 1 + +/* Version number of package */ +#define VERSION "1.3" + +/* Define to `__inline__' or `__inline' if that's what the C compiler + calls it, or to nothing if 'inline' is not supported under any name. */ +#ifndef __cplusplus +/* #undef inline */ +#endif + +/* Define to the type of a signed integer type of width exactly 32 bits if + such a type exists and the standard includes do not define it. */ +/* #undef int32_t */ diff --git a/compat/jansson/dump.c b/compat/jansson/dump.c new file mode 100644 index 0000000..a8c9cc6 --- /dev/null +++ b/compat/jansson/dump.c @@ -0,0 +1,460 @@ +/* + * Copyright (c) 2009, 2010 Petri Lehtinen + * + * Jansson is free software; you can redistribute it and/or modify + * it under the terms of the MIT license. See LICENSE for details. + */ + +#define _GNU_SOURCE +#include +#include +#include +#include + +#include +#include "jansson_private.h" +#include "strbuffer.h" +#include "utf.h" + +#define MAX_INTEGER_STR_LENGTH 100 +#define MAX_REAL_STR_LENGTH 100 + +typedef int (*dump_func)(const char *buffer, int size, void *data); + +struct string +{ + char *buffer; + int length; + int size; +}; + +static int dump_to_strbuffer(const char *buffer, int size, void *data) +{ + return strbuffer_append_bytes((strbuffer_t *)data, buffer, size); +} + +static int dump_to_file(const char *buffer, int size, void *data) +{ + FILE *dest = (FILE *)data; + if(fwrite(buffer, size, 1, dest) != 1) + return -1; + return 0; +} + +/* 256 spaces (the maximum indentation size) */ +static char whitespace[] = " "; + +static int dump_indent(unsigned long flags, int depth, int space, dump_func dump, void *data) +{ + if(JSON_INDENT(flags) > 0) + { + int i, ws_count = JSON_INDENT(flags); + + if(dump("\n", 1, data)) + return -1; + + for(i = 0; i < depth; i++) + { + if(dump(whitespace, ws_count, data)) + return -1; + } + } + else if(space && !(flags & JSON_COMPACT)) + { + return dump(" ", 1, data); + } + return 0; +} + +static int dump_string(const char *str, int ascii, dump_func dump, void *data) +{ + const char *pos, *end; + int32_t codepoint; + + if(dump("\"", 1, data)) + return -1; + + end = pos = str; + while(1) + { + const char *text; + char seq[13]; + int length; + + while(*end) + { + end = utf8_iterate(pos, &codepoint); + if(!end) + return -1; + + /* mandatory escape or control char */ + if(codepoint == '\\' || codepoint == '"' || codepoint < 0x20) + break; + + /* non-ASCII */ + if(ascii && codepoint > 0x7F) + break; + + pos = end; + } + + if(pos != str) { + if(dump(str, pos - str, data)) + return -1; + } + + if(end == pos) + break; + + /* handle \, ", and control codes */ + length = 2; + switch(codepoint) + { + case '\\': text = "\\\\"; break; + case '\"': text = "\\\""; break; + case '\b': text = "\\b"; break; + case '\f': text = "\\f"; break; + case '\n': text = "\\n"; break; + case '\r': text = "\\r"; break; + case '\t': text = "\\t"; break; + default: + { + /* codepoint is in BMP */ + if(codepoint < 0x10000) + { + sprintf(seq, "\\u%04x", codepoint); + length = 6; + } + + /* not in BMP -> construct a UTF-16 surrogate pair */ + else + { + int32_t first, last; + + codepoint -= 0x10000; + first = 0xD800 | ((codepoint & 0xffc00) >> 10); + last = 0xDC00 | (codepoint & 0x003ff); + + sprintf(seq, "\\u%04x\\u%04x", first, last); + length = 12; + } + + text = seq; + break; + } + } + + if(dump(text, length, 
data)) + return -1; + + str = pos = end; + } + + return dump("\"", 1, data); +} + +static int object_key_compare_keys(const void *key1, const void *key2) +{ + return strcmp((*(const object_key_t **)key1)->key, + (*(const object_key_t **)key2)->key); +} + +static int object_key_compare_serials(const void *key1, const void *key2) +{ + return (*(const object_key_t **)key1)->serial - + (*(const object_key_t **)key2)->serial; +} + +static int do_dump(const json_t *json, unsigned long flags, int depth, + dump_func dump, void *data) +{ + int ascii = flags & JSON_ENSURE_ASCII ? 1 : 0; + + switch(json_typeof(json)) { + case JSON_NULL: + return dump("null", 4, data); + + case JSON_TRUE: + return dump("true", 4, data); + + case JSON_FALSE: + return dump("false", 5, data); + + case JSON_INTEGER: + { + char buffer[MAX_INTEGER_STR_LENGTH]; + int size; + + size = snprintf(buffer, MAX_INTEGER_STR_LENGTH, "%d", json_integer_value(json)); + if(size >= MAX_INTEGER_STR_LENGTH) + return -1; + + return dump(buffer, size, data); + } + + case JSON_REAL: + { + char buffer[MAX_REAL_STR_LENGTH]; + int size; + + size = snprintf(buffer, MAX_REAL_STR_LENGTH, "%.17g", + json_real_value(json)); + if(size >= MAX_REAL_STR_LENGTH) + return -1; + + /* Make sure there's a dot or 'e' in the output. Otherwise + a real is converted to an integer when decoding */ + if(strchr(buffer, '.') == NULL && + strchr(buffer, 'e') == NULL) + { + if(size + 2 >= MAX_REAL_STR_LENGTH) { + /* No space to append ".0" */ + return -1; + } + buffer[size] = '.'; + buffer[size + 1] = '0'; + size += 2; + } + + return dump(buffer, size, data); + } + + case JSON_STRING: + return dump_string(json_string_value(json), ascii, dump, data); + + case JSON_ARRAY: + { + int i; + int n; + json_array_t *array; + + /* detect circular references */ + array = json_to_array(json); + if(array->visited) + goto array_error; + array->visited = 1; + + n = json_array_size(json); + + if(dump("[", 1, data)) + goto array_error; + if(n == 0) { + array->visited = 0; + return dump("]", 1, data); + } + if(dump_indent(flags, depth + 1, 0, dump, data)) + goto array_error; + + for(i = 0; i < n; ++i) { + if(do_dump(json_array_get(json, i), flags, depth + 1, + dump, data)) + goto array_error; + + if(i < n - 1) + { + if(dump(",", 1, data) || + dump_indent(flags, depth + 1, 1, dump, data)) + goto array_error; + } + else + { + if(dump_indent(flags, depth, 0, dump, data)) + goto array_error; + } + } + + array->visited = 0; + return dump("]", 1, data); + + array_error: + array->visited = 0; + return -1; + } + + case JSON_OBJECT: + { + json_object_t *object; + void *iter; + const char *separator; + int separator_length; + + if(flags & JSON_COMPACT) { + separator = ":"; + separator_length = 1; + } + else { + separator = ": "; + separator_length = 2; + } + + /* detect circular references */ + object = json_to_object(json); + if(object->visited) + goto object_error; + object->visited = 1; + + iter = json_object_iter((json_t *)json); + + if(dump("{", 1, data)) + goto object_error; + if(!iter) { + object->visited = 0; + return dump("}", 1, data); + } + if(dump_indent(flags, depth + 1, 0, dump, data)) + goto object_error; + + if(flags & JSON_SORT_KEYS || flags & JSON_PRESERVE_ORDER) + { + const object_key_t **keys; + unsigned int size; + unsigned int i; + int (*cmp_func)(const void *, const void *); + + size = json_object_size(json); + keys = malloc(size * sizeof(object_key_t *)); + if(!keys) + goto object_error; + + i = 0; + while(iter) + { + keys[i] = jsonp_object_iter_fullkey(iter); + iter = 
json_object_iter_next((json_t *)json, iter); + i++; + } + assert(i == size); + + if(flags & JSON_SORT_KEYS) + cmp_func = object_key_compare_keys; + else + cmp_func = object_key_compare_serials; + + qsort((void*)keys, size, sizeof(object_key_t *), cmp_func); + + for(i = 0; i < size; i++) + { + const char *key; + json_t *value; + + key = keys[i]->key; + value = json_object_get(json, key); + assert(value); + + dump_string(key, ascii, dump, data); + if(dump(separator, separator_length, data) || + do_dump(value, flags, depth + 1, dump, data)) + { + free((void*)keys); + goto object_error; + } + + if(i < size - 1) + { + if(dump(",", 1, data) || + dump_indent(flags, depth + 1, 1, dump, data)) + { + free((void*)keys); + goto object_error; + } + } + else + { + if(dump_indent(flags, depth, 0, dump, data)) + { + free((void*)keys); + goto object_error; + } + } + } + + free((void*)keys); + } + else + { + /* Don't sort keys */ + + while(iter) + { + void *next = json_object_iter_next((json_t *)json, iter); + + dump_string(json_object_iter_key(iter), ascii, dump, data); + if(dump(separator, separator_length, data) || + do_dump(json_object_iter_value(iter), flags, depth + 1, + dump, data)) + goto object_error; + + if(next) + { + if(dump(",", 1, data) || + dump_indent(flags, depth + 1, 1, dump, data)) + goto object_error; + } + else + { + if(dump_indent(flags, depth, 0, dump, data)) + goto object_error; + } + + iter = next; + } + } + + object->visited = 0; + return dump("}", 1, data); + + object_error: + object->visited = 0; + return -1; + } + + default: + /* not reached */ + return -1; + } +} + + +char *json_dumps(const json_t *json, unsigned long flags) +{ + strbuffer_t strbuff; + char *result; + + if(!json_is_array(json) && !json_is_object(json)) + return NULL; + + if(strbuffer_init(&strbuff)) + return NULL; + + if(do_dump(json, flags, 0, dump_to_strbuffer, (void *)&strbuff)) { + strbuffer_close(&strbuff); + return NULL; + } + + result = strdup(strbuffer_value(&strbuff)); + strbuffer_close(&strbuff); + + return result; +} + +int json_dumpf(const json_t *json, FILE *output, unsigned long flags) +{ + if(!json_is_array(json) && !json_is_object(json)) + return -1; + + return do_dump(json, flags, 0, dump_to_file, (void *)output); +} + +int json_dump_file(const json_t *json, const char *path, unsigned long flags) +{ + int result; + + FILE *output = fopen(path, "w"); + if(!output) + return -1; + + result = json_dumpf(json, output, flags); + + fclose(output); + return result; +} diff --git a/compat/jansson/hashtable.c b/compat/jansson/hashtable.c new file mode 100644 index 0000000..791f9ac --- /dev/null +++ b/compat/jansson/hashtable.c @@ -0,0 +1,379 @@ +/* + * Copyright (c) 2009, 2010 Petri Lehtinen + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the MIT license. See LICENSE for details. 
+ */ + +#include + +#include +#include "hashtable.h" + +#ifdef WIN32 +#define inline __inline +#endif + +typedef struct hashtable_list list_t; +typedef struct hashtable_pair pair_t; +typedef struct hashtable_bucket bucket_t; + +#define container_of(ptr_, type_, member_) \ + ((type_ *)((char *)ptr_ - (size_t)&((type_ *)0)->member_)) + +#define list_to_pair(list_) container_of(list_, pair_t, list) + +static inline void list_init(list_t *list) +{ + list->next = list; + list->prev = list; +} + +static inline void list_insert(list_t *list, list_t *node) +{ + node->next = list; + node->prev = list->prev; + list->prev->next = node; + list->prev = node; +} + +static inline void list_remove(list_t *list) +{ + list->prev->next = list->next; + list->next->prev = list->prev; +} + +static inline int bucket_is_empty(hashtable_t *hashtable, bucket_t *bucket) +{ + return bucket->first == &hashtable->list && bucket->first == bucket->last; +} + +static void insert_to_bucket(hashtable_t *hashtable, bucket_t *bucket, + list_t *list) +{ + if(bucket_is_empty(hashtable, bucket)) + { + list_insert(&hashtable->list, list); + bucket->first = bucket->last = list; + } + else + { + list_insert(bucket->first, list); + bucket->first = list; + } +} + +static unsigned int primes[] = { + 5, 13, 23, 53, 97, 193, 389, 769, 1543, 3079, 6151, 12289, 24593, + 49157, 98317, 196613, 393241, 786433, 1572869, 3145739, 6291469, + 12582917, 25165843, 50331653, 100663319, 201326611, 402653189, + 805306457, 1610612741 +}; +static const unsigned int num_primes = sizeof(primes) / sizeof(unsigned int); + +static inline unsigned int num_buckets(hashtable_t *hashtable) +{ + return primes[hashtable->num_buckets]; +} + + +static pair_t *hashtable_find_pair(hashtable_t *hashtable, bucket_t *bucket, + const void *key, unsigned int hash) +{ + list_t *list; + pair_t *pair; + + if(bucket_is_empty(hashtable, bucket)) + return NULL; + + list = bucket->first; + while(1) + { + pair = list_to_pair(list); + if(pair->hash == hash && hashtable->cmp_keys(pair->key, key)) + return pair; + + if(list == bucket->last) + break; + + list = list->next; + } + + return NULL; +} + +/* returns 0 on success, -1 if key was not found */ +static int hashtable_do_del(hashtable_t *hashtable, + const void *key, unsigned int hash) +{ + pair_t *pair; + bucket_t *bucket; + unsigned int index; + + index = hash % num_buckets(hashtable); + bucket = &hashtable->buckets[index]; + + pair = hashtable_find_pair(hashtable, bucket, key, hash); + if(!pair) + return -1; + + if(&pair->list == bucket->first && &pair->list == bucket->last) + bucket->first = bucket->last = &hashtable->list; + + else if(&pair->list == bucket->first) + bucket->first = pair->list.next; + + else if(&pair->list == bucket->last) + bucket->last = pair->list.prev; + + list_remove(&pair->list); + + if(hashtable->free_key) + hashtable->free_key(pair->key); + if(hashtable->free_value) + hashtable->free_value(pair->value); + + free(pair); + hashtable->size--; + + return 0; +} + +static void hashtable_do_clear(hashtable_t *hashtable) +{ + list_t *list, *next; + pair_t *pair; + + for(list = hashtable->list.next; list != &hashtable->list; list = next) + { + next = list->next; + pair = list_to_pair(list); + if(hashtable->free_key) + hashtable->free_key(pair->key); + if(hashtable->free_value) + hashtable->free_value(pair->value); + free(pair); + } +} + +static int hashtable_do_rehash(hashtable_t *hashtable) +{ + list_t *list, *next; + pair_t *pair; + unsigned int i, index, new_size; + + free(hashtable->buckets); + + 
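+    /* The old bucket array has already been freed above, but every pair is
+       still reachable through hashtable->list.  Move to the next prime in
+       primes[] and re-insert each pair into the newly allocated buckets. */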
hashtable->num_buckets++; + new_size = num_buckets(hashtable); + + hashtable->buckets = malloc(new_size * sizeof(bucket_t)); + if(!hashtable->buckets) + return -1; + + for(i = 0; i < num_buckets(hashtable); i++) + { + hashtable->buckets[i].first = hashtable->buckets[i].last = + &hashtable->list; + } + + list = hashtable->list.next; + list_init(&hashtable->list); + + for(; list != &hashtable->list; list = next) { + next = list->next; + pair = list_to_pair(list); + index = pair->hash % new_size; + insert_to_bucket(hashtable, &hashtable->buckets[index], &pair->list); + } + + return 0; +} + + +hashtable_t *hashtable_create(key_hash_fn hash_key, key_cmp_fn cmp_keys, + free_fn free_key, free_fn free_value) +{ + hashtable_t *hashtable = malloc(sizeof(hashtable_t)); + if(!hashtable) + return NULL; + + if(hashtable_init(hashtable, hash_key, cmp_keys, free_key, free_value)) + { + free(hashtable); + return NULL; + } + + return hashtable; +} + +void hashtable_destroy(hashtable_t *hashtable) +{ + hashtable_close(hashtable); + free(hashtable); +} + +int hashtable_init(hashtable_t *hashtable, + key_hash_fn hash_key, key_cmp_fn cmp_keys, + free_fn free_key, free_fn free_value) +{ + unsigned int i; + + hashtable->size = 0; + hashtable->num_buckets = 0; /* index to primes[] */ + hashtable->buckets = malloc(num_buckets(hashtable) * sizeof(bucket_t)); + if(!hashtable->buckets) + return -1; + + list_init(&hashtable->list); + + hashtable->hash_key = hash_key; + hashtable->cmp_keys = cmp_keys; + hashtable->free_key = free_key; + hashtable->free_value = free_value; + + for(i = 0; i < num_buckets(hashtable); i++) + { + hashtable->buckets[i].first = hashtable->buckets[i].last = + &hashtable->list; + } + + return 0; +} + +void hashtable_close(hashtable_t *hashtable) +{ + hashtable_do_clear(hashtable); + free(hashtable->buckets); +} + +int hashtable_set(hashtable_t *hashtable, void *key, void *value) +{ + pair_t *pair; + bucket_t *bucket; + unsigned int hash, index; + + /* rehash if the load ratio exceeds 1 */ + if(hashtable->size >= num_buckets(hashtable)) + if(hashtable_do_rehash(hashtable)) + return -1; + + hash = hashtable->hash_key(key); + index = hash % num_buckets(hashtable); + bucket = &hashtable->buckets[index]; + pair = hashtable_find_pair(hashtable, bucket, key, hash); + + if(pair) + { + if(hashtable->free_key) + hashtable->free_key(key); + if(hashtable->free_value) + hashtable->free_value(pair->value); + pair->value = value; + } + else + { + pair = malloc(sizeof(pair_t)); + if(!pair) + return -1; + + pair->key = key; + pair->value = value; + pair->hash = hash; + list_init(&pair->list); + + insert_to_bucket(hashtable, bucket, &pair->list); + + hashtable->size++; + } + return 0; +} + +void *hashtable_get(hashtable_t *hashtable, const void *key) +{ + pair_t *pair; + unsigned int hash; + bucket_t *bucket; + + hash = hashtable->hash_key(key); + bucket = &hashtable->buckets[hash % num_buckets(hashtable)]; + + pair = hashtable_find_pair(hashtable, bucket, key, hash); + if(!pair) + return NULL; + + return pair->value; +} + +int hashtable_del(hashtable_t *hashtable, const void *key) +{ + unsigned int hash = hashtable->hash_key(key); + return hashtable_do_del(hashtable, key, hash); +} + +void hashtable_clear(hashtable_t *hashtable) +{ + unsigned int i; + + hashtable_do_clear(hashtable); + + for(i = 0; i < num_buckets(hashtable); i++) + { + hashtable->buckets[i].first = hashtable->buckets[i].last = + &hashtable->list; + } + + list_init(&hashtable->list); + hashtable->size = 0; +} + +void *hashtable_iter(hashtable_t 
*hashtable) +{ + return hashtable_iter_next(hashtable, &hashtable->list); +} + +void *hashtable_iter_at(hashtable_t *hashtable, const void *key) +{ + pair_t *pair; + unsigned int hash; + bucket_t *bucket; + + hash = hashtable->hash_key(key); + bucket = &hashtable->buckets[hash % num_buckets(hashtable)]; + + pair = hashtable_find_pair(hashtable, bucket, key, hash); + if(!pair) + return NULL; + + return &pair->list; +} + +void *hashtable_iter_next(hashtable_t *hashtable, void *iter) +{ + list_t *list = (list_t *)iter; + if(list->next == &hashtable->list) + return NULL; + return list->next; +} + +void *hashtable_iter_key(void *iter) +{ + pair_t *pair = list_to_pair((list_t *)iter); + return pair->key; +} + +void *hashtable_iter_value(void *iter) +{ + pair_t *pair = list_to_pair((list_t *)iter); + return pair->value; +} + +void hashtable_iter_set(hashtable_t *hashtable, void *iter, void *value) +{ + pair_t *pair = list_to_pair((list_t *)iter); + + if(hashtable->free_value) + hashtable->free_value(pair->value); + + pair->value = value; +} diff --git a/compat/jansson/hashtable.h b/compat/jansson/hashtable.h new file mode 100644 index 0000000..52f8549 --- /dev/null +++ b/compat/jansson/hashtable.h @@ -0,0 +1,207 @@ +/* + * Copyright (c) 2009, 2010 Petri Lehtinen + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the MIT license. See LICENSE for details. + */ + +#ifndef HASHTABLE_H +#define HASHTABLE_H + +typedef unsigned int (*key_hash_fn)(const void *key); +typedef int (*key_cmp_fn)(const void *key1, const void *key2); +typedef void (*free_fn)(void *key); + +struct hashtable_list { + struct hashtable_list *prev; + struct hashtable_list *next; +}; + +struct hashtable_pair { + void *key; + void *value; + unsigned int hash; + struct hashtable_list list; +}; + +struct hashtable_bucket { + struct hashtable_list *first; + struct hashtable_list *last; +}; + +typedef struct hashtable { + unsigned int size; + struct hashtable_bucket *buckets; + unsigned int num_buckets; /* index to primes[] */ + struct hashtable_list list; + + key_hash_fn hash_key; + key_cmp_fn cmp_keys; /* returns non-zero for equal keys */ + free_fn free_key; + free_fn free_value; +} hashtable_t; + +/** + * hashtable_create - Create a hashtable object + * + * @hash_key: The key hashing function + * @cmp_keys: The key compare function. Returns non-zero for equal and + * zero for unequal unequal keys + * @free_key: If non-NULL, called for a key that is no longer referenced. + * @free_value: If non-NULL, called for a value that is no longer referenced. + * + * Returns a new hashtable object that should be freed with + * hashtable_destroy when it's no longer used, or NULL on failure (out + * of memory). + */ +hashtable_t *hashtable_create(key_hash_fn hash_key, key_cmp_fn cmp_keys, + free_fn free_key, free_fn free_value); + +/** + * hashtable_destroy - Destroy a hashtable object + * + * @hashtable: The hashtable + * + * Destroys a hashtable created with hashtable_create(). + */ +void hashtable_destroy(hashtable_t *hashtable); + +/** + * hashtable_init - Initialize a hashtable object + * + * @hashtable: The (statically allocated) hashtable object + * @hash_key: The key hashing function + * @cmp_keys: The key compare function. Returns non-zero for equal and + * zero for unequal unequal keys + * @free_key: If non-NULL, called for a key that is no longer referenced. + * @free_value: If non-NULL, called for a value that is no longer referenced. 
+ * + * Initializes a statically allocated hashtable object. The object + * should be cleared with hashtable_close when it's no longer used. + * + * Returns 0 on success, -1 on error (out of memory). + */ +int hashtable_init(hashtable_t *hashtable, + key_hash_fn hash_key, key_cmp_fn cmp_keys, + free_fn free_key, free_fn free_value); + +/** + * hashtable_close - Release all resources used by a hashtable object + * + * @hashtable: The hashtable + * + * Destroys a statically allocated hashtable object. + */ +void hashtable_close(hashtable_t *hashtable); + +/** + * hashtable_set - Add/modify value in hashtable + * + * @hashtable: The hashtable object + * @key: The key + * @value: The value + * + * If a value with the given key already exists, its value is replaced + * with the new value. + * + * Key and value are "stealed" in the sense that hashtable frees them + * automatically when they are no longer used. The freeing is + * accomplished by calling free_key and free_value functions that were + * supplied to hashtable_new. In case one or both of the free + * functions is NULL, the corresponding item is not "stealed". + * + * Returns 0 on success, -1 on failure (out of memory). + */ +int hashtable_set(hashtable_t *hashtable, void *key, void *value); + +/** + * hashtable_get - Get a value associated with a key + * + * @hashtable: The hashtable object + * @key: The key + * + * Returns value if it is found, or NULL otherwise. + */ +void *hashtable_get(hashtable_t *hashtable, const void *key); + +/** + * hashtable_del - Remove a value from the hashtable + * + * @hashtable: The hashtable object + * @key: The key + * + * Returns 0 on success, or -1 if the key was not found. + */ +int hashtable_del(hashtable_t *hashtable, const void *key); + +/** + * hashtable_clear - Clear hashtable + * + * @hashtable: The hashtable object + * + * Removes all items from the hashtable. + */ +void hashtable_clear(hashtable_t *hashtable); + +/** + * hashtable_iter - Iterate over hashtable + * + * @hashtable: The hashtable object + * + * Returns an opaque iterator to the first element in the hashtable. + * The iterator should be passed to hashtable_iter_* functions. + * The hashtable items are not iterated over in any particular order. + * + * There's no need to free the iterator in any way. The iterator is + * valid as long as the item that is referenced by the iterator is not + * deleted. Other values may be added or deleted. In particular, + * hashtable_iter_next() may be called on an iterator, and after that + * the key/value pair pointed by the old iterator may be deleted. + */ +void *hashtable_iter(hashtable_t *hashtable); + +/** + * hashtable_iter_at - Return an iterator at a specific key + * + * @hashtable: The hashtable object + * @key: The key that the iterator should point to + * + * Like hashtable_iter() but returns an iterator pointing to a + * specific key. + */ +void *hashtable_iter_at(hashtable_t *hashtable, const void *key); + +/** + * hashtable_iter_next - Advance an iterator + * + * @hashtable: The hashtable object + * @iter: The iterator + * + * Returns a new iterator pointing to the next element in the + * hashtable or NULL if the whole hastable has been iterated over. 
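+ *
+ * A typical loop over all pairs (illustrative, not part of the original
+ * header; use() stands for any caller-supplied handler):
+ *
+ *     void *iter = hashtable_iter(hashtable);
+ *     while(iter) {
+ *         use(hashtable_iter_key(iter), hashtable_iter_value(iter));
+ *         iter = hashtable_iter_next(hashtable, iter);
+ *     }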
+ */ +void *hashtable_iter_next(hashtable_t *hashtable, void *iter); + +/** + * hashtable_iter_key - Retrieve the key pointed by an iterator + * + * @iter: The iterator + */ +void *hashtable_iter_key(void *iter); + +/** + * hashtable_iter_value - Retrieve the value pointed by an iterator + * + * @iter: The iterator + */ +void *hashtable_iter_value(void *iter); + +/** + * hashtable_iter_set - Set the value pointed by an iterator + * + * @iter: The iterator + * @value: The value to set + */ +void hashtable_iter_set(hashtable_t *hashtable, void *iter, void *value); + +#endif diff --git a/compat/jansson/jansson.h b/compat/jansson/jansson.h new file mode 100644 index 0000000..781896c --- /dev/null +++ b/compat/jansson/jansson.h @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2009, 2010 Petri Lehtinen + * + * Jansson is free software; you can redistribute it and/or modify + * it under the terms of the MIT license. See LICENSE for details. + */ + +#ifndef JANSSON_H +#define JANSSON_H + +#include + +#ifdef WIN32 +#define snprintf(...) _snprintf(__VA_ARGS__) +#define strdup(x) _strdup(x) +#endif + +#ifdef WIN32 +#define inline __inline +#endif + +#ifndef __cplusplus +#define JSON_INLINE inline +#else +#define JSON_INLINE inline +extern "C" { +#endif + +/* types */ + +typedef enum { + JSON_OBJECT, + JSON_ARRAY, + JSON_STRING, + JSON_INTEGER, + JSON_REAL, + JSON_TRUE, + JSON_FALSE, + JSON_NULL +} json_type; + +typedef struct { + json_type type; + unsigned long refcount; +} json_t; + +#define json_typeof(json) ((json)->type) +#define json_is_object(json) (json && json_typeof(json) == JSON_OBJECT) +#define json_is_array(json) (json && json_typeof(json) == JSON_ARRAY) +#define json_is_string(json) (json && json_typeof(json) == JSON_STRING) +#define json_is_integer(json) (json && json_typeof(json) == JSON_INTEGER) +#define json_is_real(json) (json && json_typeof(json) == JSON_REAL) +#define json_is_number(json) (json_is_integer(json) || json_is_real(json)) +#define json_is_true(json) (json && json_typeof(json) == JSON_TRUE) +#define json_is_false(json) (json && json_typeof(json) == JSON_FALSE) +#define json_is_boolean(json) (json_is_true(json) || json_is_false(json)) +#define json_is_null(json) (json && json_typeof(json) == JSON_NULL) + +/* construction, destruction, reference counting */ + +json_t *json_object(void); +json_t *json_array(void); +json_t *json_string(const char *value); +json_t *json_string_nocheck(const char *value); +json_t *json_integer(int value); +json_t *json_real(double value); +json_t *json_true(void); +json_t *json_false(void); +json_t *json_null(void); + +static JSON_INLINE +json_t *json_incref(json_t *json) +{ + if(json && json->refcount != (unsigned int)-1) + ++json->refcount; + return json; +} + +/* do not call json_delete directly */ +void json_delete(json_t *json); + +static JSON_INLINE +void json_decref(json_t *json) +{ + if(json && json->refcount != (unsigned int)-1 && --json->refcount == 0) + json_delete(json); +} + + +/* getters, setters, manipulation */ + +unsigned int json_object_size(const json_t *object); +json_t *json_object_get(const json_t *object, const char *key); +int json_object_set_new(json_t *object, const char *key, json_t *value); +int json_object_set_new_nocheck(json_t *object, const char *key, json_t *value); +int json_object_del(json_t *object, const char *key); +int json_object_clear(json_t *object); +int json_object_update(json_t *object, json_t *other); +void *json_object_iter(json_t *object); +void *json_object_iter_at(json_t *object, const char *key); 
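+/*
+ * Illustrative usage (not part of the original header): build an object,
+ * serialize it, and release it.  json_object_set_new() steals the reference
+ * to its value argument, so only the object itself needs json_decref().
+ * The key and integer value below are arbitrary example data.
+ *
+ *     json_t *obj = json_object();
+ *     json_object_set_new(obj, "retries", json_integer(3));
+ *     char *text = json_dumps(obj, JSON_INDENT(2));
+ *     // ... use text ...
+ *     free(text);
+ *     json_decref(obj);
+ */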
+void *json_object_iter_next(json_t *object, void *iter); +const char *json_object_iter_key(void *iter); +json_t *json_object_iter_value(void *iter); +int json_object_iter_set_new(json_t *object, void *iter, json_t *value); + +static JSON_INLINE +int json_object_set(json_t *object, const char *key, json_t *value) +{ + return json_object_set_new(object, key, json_incref(value)); +} + +static JSON_INLINE +int json_object_set_nocheck(json_t *object, const char *key, json_t *value) +{ + return json_object_set_new_nocheck(object, key, json_incref(value)); +} + +static inline +int json_object_iter_set(json_t *object, void *iter, json_t *value) +{ + return json_object_iter_set_new(object, iter, json_incref(value)); +} + +unsigned int json_array_size(const json_t *array); +json_t *json_array_get(const json_t *array, unsigned int index); +int json_array_set_new(json_t *array, unsigned int index, json_t *value); +int json_array_append_new(json_t *array, json_t *value); +int json_array_insert_new(json_t *array, unsigned int index, json_t *value); +int json_array_remove(json_t *array, unsigned int index); +int json_array_clear(json_t *array); +int json_array_extend(json_t *array, json_t *other); + +static JSON_INLINE +int json_array_set(json_t *array, unsigned int index, json_t *value) +{ + return json_array_set_new(array, index, json_incref(value)); +} + +static JSON_INLINE +int json_array_append(json_t *array, json_t *value) +{ + return json_array_append_new(array, json_incref(value)); +} + +static JSON_INLINE +int json_array_insert(json_t *array, unsigned int index, json_t *value) +{ + return json_array_insert_new(array, index, json_incref(value)); +} + +const char *json_string_value(const json_t *string); +int json_integer_value(const json_t *integer); +double json_real_value(const json_t *real); +double json_number_value(const json_t *json); + +int json_string_set(json_t *string, const char *value); +int json_string_set_nocheck(json_t *string, const char *value); +int json_integer_set(json_t *integer, int value); +int json_real_set(json_t *real, double value); + + +/* equality */ + +int json_equal(json_t *value1, json_t *value2); + + +/* copying */ + +json_t *json_copy(json_t *value); +json_t *json_deep_copy(json_t *value); + + +/* loading, printing */ + +#define JSON_ERROR_TEXT_LENGTH 160 + +typedef struct { + char text[JSON_ERROR_TEXT_LENGTH]; + int line; +} json_error_t; + +json_t *json_loads(const char *input, json_error_t *error); +json_t *json_loadf(FILE *input, json_error_t *error); +json_t *json_load_file(const char *path, json_error_t *error); + +#define JSON_INDENT(n) (n & 0xFF) +#define JSON_COMPACT 0x100 +#define JSON_ENSURE_ASCII 0x200 +#define JSON_SORT_KEYS 0x400 +#define JSON_PRESERVE_ORDER 0x800 + +char *json_dumps(const json_t *json, unsigned long flags); +int json_dumpf(const json_t *json, FILE *output, unsigned long flags); +int json_dump_file(const json_t *json, const char *path, unsigned long flags); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/compat/jansson/jansson_private.h b/compat/jansson/jansson_private.h new file mode 100644 index 0000000..3a3ed75 --- /dev/null +++ b/compat/jansson/jansson_private.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2009, 2010 Petri Lehtinen + * + * Jansson is free software; you can redistribute it and/or modify + * it under the terms of the MIT license. See LICENSE for details. 
+ */ + +#ifndef JANSSON_PRIVATE_H +#define JANSSON_PRIVATE_H + +#include "jansson.h" +#include "hashtable.h" + +#define container_of(ptr_, type_, member_) \ + ((type_ *)((char *)ptr_ - (size_t)&((type_ *)0)->member_)) + +typedef struct { + json_t json; + hashtable_t hashtable; + unsigned long serial; + int visited; +} json_object_t; + +typedef struct { + json_t json; + unsigned int size; + unsigned int entries; + json_t **table; + int visited; +} json_array_t; + +typedef struct { + json_t json; + char *value; +} json_string_t; + +typedef struct { + json_t json; + double value; +} json_real_t; + +typedef struct { + json_t json; + int value; +} json_integer_t; + +#define json_to_object(json_) container_of(json_, json_object_t, json) +#define json_to_array(json_) container_of(json_, json_array_t, json) +#define json_to_string(json_) container_of(json_, json_string_t, json) +#define json_to_real(json_) container_of(json_, json_real_t, json) +#define json_to_integer(json_) container_of(json_, json_integer_t, json) + +typedef struct { + unsigned long serial; + char key[]; +} object_key_t; + +const object_key_t *jsonp_object_iter_fullkey(void *iter); + +#endif diff --git a/compat/jansson/load.c b/compat/jansson/load.c new file mode 100644 index 0000000..ee56fbe --- /dev/null +++ b/compat/jansson/load.c @@ -0,0 +1,879 @@ +/* + * Copyright (c) 2009, 2010 Petri Lehtinen + * + * Jansson is free software; you can redistribute it and/or modify + * it under the terms of the MIT license. See LICENSE for details. + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "jansson_private.h" +#include "strbuffer.h" +#include "utf.h" + +#define TOKEN_INVALID -1 +#define TOKEN_EOF 0 +#define TOKEN_STRING 256 +#define TOKEN_INTEGER 257 +#define TOKEN_REAL 258 +#define TOKEN_TRUE 259 +#define TOKEN_FALSE 260 +#define TOKEN_NULL 261 + +/* read one byte from stream, return EOF on end of file */ +typedef int (*get_func)(void *data); + +/* return non-zero if end of file has been reached */ +typedef int (*eof_func)(void *data); + +typedef struct { + get_func get; + eof_func eof; + void *data; + int stream_pos; + char buffer[5]; + int buffer_pos; +} stream_t; + + +typedef struct { + stream_t stream; + strbuffer_t saved_text; + int token; + int line, column; + union { + char *string; + int integer; + double real; + } value; +} lex_t; + + +/*** error reporting ***/ + +static void error_init(json_error_t *error) +{ + if(error) + { + error->text[0] = '\0'; + error->line = -1; + } +} + +static void error_set(json_error_t *error, const lex_t *lex, + const char *msg, ...) 
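+/* printf-style formatter for json_error_t; only the first error reported is
+   kept -- later calls return early once error->text is already non-empty */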
+{ + va_list ap; + char text[JSON_ERROR_TEXT_LENGTH]; + + if(!error || error->text[0] != '\0') { + /* error already set */ + return; + } + + va_start(ap, msg); + vsnprintf(text, JSON_ERROR_TEXT_LENGTH, msg, ap); + va_end(ap); + + if(lex) + { + const char *saved_text = strbuffer_value(&lex->saved_text); + error->line = lex->line; + if(saved_text && saved_text[0]) + { + if(lex->saved_text.length <= 20) { + snprintf(error->text, JSON_ERROR_TEXT_LENGTH, + "%s near '%s'", text, saved_text); + } + else + snprintf(error->text, JSON_ERROR_TEXT_LENGTH, "%s", text); + } + else + { + snprintf(error->text, JSON_ERROR_TEXT_LENGTH, + "%s near end of file", text); + } + } + else + { + error->line = -1; + snprintf(error->text, JSON_ERROR_TEXT_LENGTH, "%s", text); + } +} + + +/*** lexical analyzer ***/ + +static void +stream_init(stream_t *stream, get_func get, eof_func eof, void *data) +{ + stream->get = get; + stream->eof = eof; + stream->data = data; + stream->stream_pos = 0; + stream->buffer[0] = '\0'; + stream->buffer_pos = 0; +} + +static char stream_get(stream_t *stream, json_error_t *error) +{ + char c; + + if(!stream->buffer[stream->buffer_pos]) + { + stream->buffer[0] = stream->get(stream->data); + stream->buffer_pos = 0; + + c = stream->buffer[0]; + + if((unsigned char)c >= 0x80 && c != (char)EOF) + { + /* multi-byte UTF-8 sequence */ + int i, count; + + count = utf8_check_first(c); + if(!count) + goto out; + + assert(count >= 2); + + for(i = 1; i < count; i++) + stream->buffer[i] = stream->get(stream->data); + + if(!utf8_check_full(stream->buffer, count, NULL)) + goto out; + + stream->stream_pos += count; + stream->buffer[count] = '\0'; + } + else { + stream->buffer[1] = '\0'; + stream->stream_pos++; + } + } + + return stream->buffer[stream->buffer_pos++]; + +out: + error_set(error, NULL, "unable to decode byte 0x%x at position %d", + (unsigned char)c, stream->stream_pos); + + stream->buffer[0] = EOF; + stream->buffer[1] = '\0'; + stream->buffer_pos = 1; + + return EOF; +} + +static void stream_unget(stream_t *stream, char c) +{ + assert(stream->buffer_pos > 0); + stream->buffer_pos--; + assert(stream->buffer[stream->buffer_pos] == c); +} + + +static int lex_get(lex_t *lex, json_error_t *error) +{ + return stream_get(&lex->stream, error); +} + +static int lex_eof(lex_t *lex) +{ + return lex->stream.eof(lex->stream.data); +} + +static void lex_save(lex_t *lex, char c) +{ + strbuffer_append_byte(&lex->saved_text, c); +} + +static int lex_get_save(lex_t *lex, json_error_t *error) +{ + char c = stream_get(&lex->stream, error); + lex_save(lex, c); + return c; +} + +static void lex_unget_unsave(lex_t *lex, char c) +{ + char d; + stream_unget(&lex->stream, c); + d = strbuffer_pop(&lex->saved_text); + assert(c == d); +} + +static void lex_save_cached(lex_t *lex) +{ + while(lex->stream.buffer[lex->stream.buffer_pos] != '\0') + { + lex_save(lex, lex->stream.buffer[lex->stream.buffer_pos]); + lex->stream.buffer_pos++; + } +} + +/* assumes that str points to 'u' plus at least 4 valid hex digits */ +static int32_t decode_unicode_escape(const char *str) +{ + int i; + int32_t value = 0; + + assert(str[0] == 'u'); + + for(i = 1; i <= 4; i++) { + char c = str[i]; + value <<= 4; + if(isdigit(c)) + value += c - '0'; + else if(islower(c)) + value += c - 'a' + 10; + else if(isupper(c)) + value += c - 'A' + 10; + else + assert(0); + } + + return value; +} + +static void lex_scan_string(lex_t *lex, json_error_t *error) +{ + char c; + const char *p; + char *t; + int i; + + lex->value.string = NULL; + lex->token = 
TOKEN_INVALID; + + c = lex_get_save(lex, error); + + while(c != '"') { + if(c == (char)EOF) { + lex_unget_unsave(lex, c); + if(lex_eof(lex)) + error_set(error, lex, "premature end of input"); + goto out; + } + + else if((unsigned char)c <= 0x1F) { + /* control character */ + lex_unget_unsave(lex, c); + if(c == '\n') + error_set(error, lex, "unexpected newline", c); + else + error_set(error, lex, "control character 0x%x", c); + goto out; + } + + else if(c == '\\') { + c = lex_get_save(lex, error); + if(c == 'u') { + c = lex_get_save(lex, error); + for(i = 0; i < 4; i++) { + if(!isxdigit(c)) { + lex_unget_unsave(lex, c); + error_set(error, lex, "invalid escape"); + goto out; + } + c = lex_get_save(lex, error); + } + } + else if(c == '"' || c == '\\' || c == '/' || c == 'b' || + c == 'f' || c == 'n' || c == 'r' || c == 't') + c = lex_get_save(lex, error); + else { + lex_unget_unsave(lex, c); + error_set(error, lex, "invalid escape"); + goto out; + } + } + else + c = lex_get_save(lex, error); + } + + /* the actual value is at most of the same length as the source + string, because: + - shortcut escapes (e.g. "\t") (length 2) are converted to 1 byte + - a single \uXXXX escape (length 6) is converted to at most 3 bytes + - two \uXXXX escapes (length 12) forming an UTF-16 surrogate pair + are converted to 4 bytes + */ + lex->value.string = malloc(lex->saved_text.length + 1); + if(!lex->value.string) { + /* this is not very nice, since TOKEN_INVALID is returned */ + goto out; + } + + /* the target */ + t = lex->value.string; + + /* + 1 to skip the " */ + p = strbuffer_value(&lex->saved_text) + 1; + + while(*p != '"') { + if(*p == '\\') { + p++; + if(*p == 'u') { + char buffer[4]; + int length; + int32_t value; + + value = decode_unicode_escape(p); + p += 5; + + if(0xD800 <= value && value <= 0xDBFF) { + /* surrogate pair */ + if(*p == '\\' && *(p + 1) == 'u') { + int32_t value2 = decode_unicode_escape(++p); + p += 5; + + if(0xDC00 <= value2 && value2 <= 0xDFFF) { + /* valid second surrogate */ + value = + ((value - 0xD800) << 10) + + (value2 - 0xDC00) + + 0x10000; + } + else { + /* invalid second surrogate */ + error_set(error, lex, + "invalid Unicode '\\u%04X\\u%04X'", + value, value2); + goto out; + } + } + else { + /* no second surrogate */ + error_set(error, lex, "invalid Unicode '\\u%04X'", + value); + goto out; + } + } + else if(0xDC00 <= value && value <= 0xDFFF) { + error_set(error, lex, "invalid Unicode '\\u%04X'", value); + goto out; + } + else if(value == 0) + { + error_set(error, lex, "\\u0000 is not allowed"); + goto out; + } + + if(utf8_encode(value, buffer, &length)) + assert(0); + + memcpy(t, buffer, length); + t += length; + } + else { + switch(*p) { + case '"': case '\\': case '/': + *t = *p; break; + case 'b': *t = '\b'; break; + case 'f': *t = '\f'; break; + case 'n': *t = '\n'; break; + case 'r': *t = '\r'; break; + case 't': *t = '\t'; break; + default: assert(0); + } + t++; + p++; + } + } + else + *(t++) = *(p++); + } + *t = '\0'; + lex->token = TOKEN_STRING; + return; + +out: + free(lex->value.string); +} + +static int lex_scan_number(lex_t *lex, char c, json_error_t *error) +{ + const char *saved_text; + char *end; + double value; + + lex->token = TOKEN_INVALID; + + if(c == '-') + c = lex_get_save(lex, error); + + if(c == '0') { + c = lex_get_save(lex, error); + if(isdigit(c)) { + lex_unget_unsave(lex, c); + goto out; + } + } + else if(isdigit(c)) { + c = lex_get_save(lex, error); + while(isdigit(c)) + c = lex_get_save(lex, error); + } + else { + lex_unget_unsave(lex, c); 
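+        /* no digit where the integer part should start: not a number token */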
+ goto out; + } + + if(c != '.' && c != 'E' && c != 'e') { + long value; + + lex_unget_unsave(lex, c); + + saved_text = strbuffer_value(&lex->saved_text); + value = strtol(saved_text, &end, 10); + assert(end == saved_text + lex->saved_text.length); + + if((value == LONG_MAX && errno == ERANGE) || value > INT_MAX) { + error_set(error, lex, "too big integer"); + goto out; + } + else if((value == LONG_MIN && errno == ERANGE) || value < INT_MIN) { + error_set(error, lex, "too big negative integer"); + goto out; + } + + lex->token = TOKEN_INTEGER; + lex->value.integer = (int)value; + return 0; + } + + if(c == '.') { + c = lex_get(lex, error); + if(!isdigit(c)) + goto out; + lex_save(lex, c); + + c = lex_get_save(lex, error); + while(isdigit(c)) + c = lex_get_save(lex, error); + } + + if(c == 'E' || c == 'e') { + c = lex_get_save(lex, error); + if(c == '+' || c == '-') + c = lex_get_save(lex, error); + + if(!isdigit(c)) { + lex_unget_unsave(lex, c); + goto out; + } + + c = lex_get_save(lex, error); + while(isdigit(c)) + c = lex_get_save(lex, error); + } + + lex_unget_unsave(lex, c); + + saved_text = strbuffer_value(&lex->saved_text); + value = strtod(saved_text, &end); + assert(end == saved_text + lex->saved_text.length); + + if(errno == ERANGE && value != 0) { + error_set(error, lex, "real number overflow"); + goto out; + } + + lex->token = TOKEN_REAL; + lex->value.real = value; + return 0; + +out: + return -1; +} + +static int lex_scan(lex_t *lex, json_error_t *error) +{ + char c; + + strbuffer_clear(&lex->saved_text); + + if(lex->token == TOKEN_STRING) { + free(lex->value.string); + lex->value.string = NULL; + } + + c = lex_get(lex, error); + while(c == ' ' || c == '\t' || c == '\n' || c == '\r') + { + if(c == '\n') + lex->line++; + + c = lex_get(lex, error); + } + + if(c == (char)EOF) { + if(lex_eof(lex)) + lex->token = TOKEN_EOF; + else + lex->token = TOKEN_INVALID; + goto out; + } + + lex_save(lex, c); + + if(c == '{' || c == '}' || c == '[' || c == ']' || c == ':' || c == ',') + lex->token = c; + + else if(c == '"') + lex_scan_string(lex, error); + + else if(isdigit(c) || c == '-') { + if(lex_scan_number(lex, c, error)) + goto out; + } + + else if(isupper(c) || islower(c)) { + /* eat up the whole identifier for clearer error messages */ + const char *saved_text; + + c = lex_get_save(lex, error); + while(isupper(c) || islower(c)) + c = lex_get_save(lex, error); + lex_unget_unsave(lex, c); + + saved_text = strbuffer_value(&lex->saved_text); + + if(strcmp(saved_text, "true") == 0) + lex->token = TOKEN_TRUE; + else if(strcmp(saved_text, "false") == 0) + lex->token = TOKEN_FALSE; + else if(strcmp(saved_text, "null") == 0) + lex->token = TOKEN_NULL; + else + lex->token = TOKEN_INVALID; + } + + else { + /* save the rest of the input UTF-8 sequence to get an error + message of valid UTF-8 */ + lex_save_cached(lex); + lex->token = TOKEN_INVALID; + } + +out: + return lex->token; +} + +static char *lex_steal_string(lex_t *lex) +{ + char *result = NULL; + if(lex->token == TOKEN_STRING) + { + result = lex->value.string; + lex->value.string = NULL; + } + return result; +} + +static int lex_init(lex_t *lex, get_func get, eof_func eof, void *data) +{ + stream_init(&lex->stream, get, eof, data); + if(strbuffer_init(&lex->saved_text)) + return -1; + + lex->token = TOKEN_INVALID; + lex->line = 1; + + return 0; +} + +static void lex_close(lex_t *lex) +{ + if(lex->token == TOKEN_STRING) + free(lex->value.string); + strbuffer_close(&lex->saved_text); +} + + +/*** parser ***/ + +static json_t *parse_value(lex_t 
*lex, json_error_t *error); + +static json_t *parse_object(lex_t *lex, json_error_t *error) +{ + json_t *object = json_object(); + if(!object) + return NULL; + + lex_scan(lex, error); + if(lex->token == '}') + return object; + + while(1) { + char *key; + json_t *value; + + if(lex->token != TOKEN_STRING) { + error_set(error, lex, "string or '}' expected"); + goto error; + } + + key = lex_steal_string(lex); + if(!key) + return NULL; + + lex_scan(lex, error); + if(lex->token != ':') { + free(key); + error_set(error, lex, "':' expected"); + goto error; + } + + lex_scan(lex, error); + value = parse_value(lex, error); + if(!value) { + free(key); + goto error; + } + + if(json_object_set_nocheck(object, key, value)) { + free(key); + json_decref(value); + goto error; + } + + json_decref(value); + free(key); + + lex_scan(lex, error); + if(lex->token != ',') + break; + + lex_scan(lex, error); + } + + if(lex->token != '}') { + error_set(error, lex, "'}' expected"); + goto error; + } + + return object; + +error: + json_decref(object); + return NULL; +} + +static json_t *parse_array(lex_t *lex, json_error_t *error) +{ + json_t *array = json_array(); + if(!array) + return NULL; + + lex_scan(lex, error); + if(lex->token == ']') + return array; + + while(lex->token) { + json_t *elem = parse_value(lex, error); + if(!elem) + goto error; + + if(json_array_append(array, elem)) { + json_decref(elem); + goto error; + } + json_decref(elem); + + lex_scan(lex, error); + if(lex->token != ',') + break; + + lex_scan(lex, error); + } + + if(lex->token != ']') { + error_set(error, lex, "']' expected"); + goto error; + } + + return array; + +error: + json_decref(array); + return NULL; +} + +static json_t *parse_value(lex_t *lex, json_error_t *error) +{ + json_t *json; + + switch(lex->token) { + case TOKEN_STRING: { + json = json_string_nocheck(lex->value.string); + break; + } + + case TOKEN_INTEGER: { + json = json_integer(lex->value.integer); + break; + } + + case TOKEN_REAL: { + json = json_real(lex->value.real); + break; + } + + case TOKEN_TRUE: + json = json_true(); + break; + + case TOKEN_FALSE: + json = json_false(); + break; + + case TOKEN_NULL: + json = json_null(); + break; + + case '{': + json = parse_object(lex, error); + break; + + case '[': + json = parse_array(lex, error); + break; + + case TOKEN_INVALID: + error_set(error, lex, "invalid token"); + return NULL; + + default: + error_set(error, lex, "unexpected token"); + return NULL; + } + + if(!json) + return NULL; + + return json; +} + +static json_t *parse_json(lex_t *lex, json_error_t *error) +{ + error_init(error); + + lex_scan(lex, error); + if(lex->token != '[' && lex->token != '{') { + error_set(error, lex, "'[' or '{' expected"); + return NULL; + } + + return parse_value(lex, error); +} + +typedef struct +{ + const char *data; + int pos; +} string_data_t; + +static int string_get(void *data) +{ + char c; + string_data_t *stream = (string_data_t *)data; + c = stream->data[stream->pos]; + if(c == '\0') + return EOF; + else + { + stream->pos++; + return c; + } +} + +static int string_eof(void *data) +{ + string_data_t *stream = (string_data_t *)data; + return (stream->data[stream->pos] == '\0'); +} + +json_t *json_loads(const char *string, json_error_t *error) +{ + lex_t lex; + json_t *result; + + string_data_t stream_data = { + string, + 0 + }; + + if(lex_init(&lex, string_get, string_eof, (void *)&stream_data)) + return NULL; + + result = parse_json(&lex, error); + if(!result) + goto out; + + lex_scan(&lex, error); + if(lex.token != TOKEN_EOF) { + 
error_set(error, &lex, "end of file expected"); + json_decref(result); + result = NULL; + } + +out: + lex_close(&lex); + return result; +} + +json_t *json_loadf(FILE *input, json_error_t *error) +{ + lex_t lex; + json_t *result; + + if(lex_init(&lex, (get_func)fgetc, (eof_func)feof, input)) + return NULL; + + result = parse_json(&lex, error); + if(!result) + goto out; + + lex_scan(&lex, error); + if(lex.token != TOKEN_EOF) { + error_set(error, &lex, "end of file expected"); + json_decref(result); + result = NULL; + } + +out: + lex_close(&lex); + return result; +} + +json_t *json_load_file(const char *path, json_error_t *error) +{ + json_t *result; + FILE *fp; + + error_init(error); + + fp = fopen(path, "r"); + if(!fp) + { + error_set(error, NULL, "unable to open %s: %s", + path, strerror(errno)); + return NULL; + } + + result = json_loadf(fp, error); + + fclose(fp); + return result; +} diff --git a/compat/jansson/strbuffer.c b/compat/jansson/strbuffer.c new file mode 100644 index 0000000..0019645 --- /dev/null +++ b/compat/jansson/strbuffer.c @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2009, 2010 Petri Lehtinen + * + * Jansson is free software; you can redistribute it and/or modify + * it under the terms of the MIT license. See LICENSE for details. + */ + +#define _GNU_SOURCE +#include +#include +#include "strbuffer.h" +#include "util.h" + +#define STRBUFFER_MIN_SIZE 16 +#define STRBUFFER_FACTOR 2 + +int strbuffer_init(strbuffer_t *strbuff) +{ + strbuff->size = STRBUFFER_MIN_SIZE; + strbuff->length = 0; + + strbuff->value = malloc(strbuff->size); + if(!strbuff->value) + return -1; + + /* initialize to empty */ + strbuff->value[0] = '\0'; + return 0; +} + +void strbuffer_close(strbuffer_t *strbuff) +{ + free(strbuff->value); + strbuff->size = 0; + strbuff->length = 0; + strbuff->value = NULL; +} + +void strbuffer_clear(strbuffer_t *strbuff) +{ + strbuff->length = 0; + strbuff->value[0] = '\0'; +} + +const char *strbuffer_value(const strbuffer_t *strbuff) +{ + return strbuff->value; +} + +char *strbuffer_steal_value(strbuffer_t *strbuff) +{ + char *result = strbuff->value; + strbuffer_init(strbuff); + return result; +} + +int strbuffer_append(strbuffer_t *strbuff, const char *string) +{ + return strbuffer_append_bytes(strbuff, string, strlen(string)); +} + +int strbuffer_append_byte(strbuffer_t *strbuff, char byte) +{ + return strbuffer_append_bytes(strbuff, &byte, 1); +} + +int strbuffer_append_bytes(strbuffer_t *strbuff, const char *data, int size) +{ + if(strbuff->length + size >= strbuff->size) + { + strbuff->size = max(strbuff->size * STRBUFFER_FACTOR, + strbuff->length + size + 1); + + strbuff->value = realloc(strbuff->value, strbuff->size); + if(!strbuff->value) + return -1; + } + + memcpy(strbuff->value + strbuff->length, data, size); + strbuff->length += size; + strbuff->value[strbuff->length] = '\0'; + + return 0; +} + +char strbuffer_pop(strbuffer_t *strbuff) +{ + if(strbuff->length > 0) { + char c = strbuff->value[--strbuff->length]; + strbuff->value[strbuff->length] = '\0'; + return c; + } + else + return '\0'; +} diff --git a/compat/jansson/strbuffer.h b/compat/jansson/strbuffer.h new file mode 100644 index 0000000..816594a --- /dev/null +++ b/compat/jansson/strbuffer.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2009, 2010 Petri Lehtinen + * + * Jansson is free software; you can redistribute it and/or modify + * it under the terms of the MIT license. See LICENSE for details. 
+ */ + +#ifndef STRBUFFER_H +#define STRBUFFER_H + +typedef struct { + char *value; + int length; /* bytes used */ + int size; /* bytes allocated */ +} strbuffer_t; + +int strbuffer_init(strbuffer_t *strbuff); +void strbuffer_close(strbuffer_t *strbuff); + +void strbuffer_clear(strbuffer_t *strbuff); + +const char *strbuffer_value(const strbuffer_t *strbuff); +char *strbuffer_steal_value(strbuffer_t *strbuff); + +int strbuffer_append(strbuffer_t *strbuff, const char *string); +int strbuffer_append_byte(strbuffer_t *strbuff, char byte); +int strbuffer_append_bytes(strbuffer_t *strbuff, const char *data, int size); + +char strbuffer_pop(strbuffer_t *strbuff); + +#endif diff --git a/compat/jansson/utf.c b/compat/jansson/utf.c new file mode 100644 index 0000000..2b64450 --- /dev/null +++ b/compat/jansson/utf.c @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2009, 2010 Petri Lehtinen + * + * Jansson is free software; you can redistribute it and/or modify + * it under the terms of the MIT license. See LICENSE for details. + */ + +#include +#include "utf.h" + +int utf8_encode(int32_t codepoint, char *buffer, int *size) +{ + if(codepoint < 0) + return -1; + else if(codepoint < 0x80) + { + buffer[0] = (char)codepoint; + *size = 1; + } + else if(codepoint < 0x800) + { + buffer[0] = 0xC0 + ((codepoint & 0x7C0) >> 6); + buffer[1] = 0x80 + ((codepoint & 0x03F)); + *size = 2; + } + else if(codepoint < 0x10000) + { + buffer[0] = 0xE0 + ((codepoint & 0xF000) >> 12); + buffer[1] = 0x80 + ((codepoint & 0x0FC0) >> 6); + buffer[2] = 0x80 + ((codepoint & 0x003F)); + *size = 3; + } + else if(codepoint <= 0x10FFFF) + { + buffer[0] = 0xF0 + ((codepoint & 0x1C0000) >> 18); + buffer[1] = 0x80 + ((codepoint & 0x03F000) >> 12); + buffer[2] = 0x80 + ((codepoint & 0x000FC0) >> 6); + buffer[3] = 0x80 + ((codepoint & 0x00003F)); + *size = 4; + } + else + return -1; + + return 0; +} + +int utf8_check_first(char byte) +{ + unsigned char u = (unsigned char)byte; + + if(u < 0x80) + return 1; + + if(0x80 <= u && u <= 0xBF) { + /* second, third or fourth byte of a multi-byte + sequence, i.e. 
a "continuation byte" */ + return 0; + } + else if(u == 0xC0 || u == 0xC1) { + /* overlong encoding of an ASCII byte */ + return 0; + } + else if(0xC2 <= u && u <= 0xDF) { + /* 2-byte sequence */ + return 2; + } + + else if(0xE0 <= u && u <= 0xEF) { + /* 3-byte sequence */ + return 3; + } + else if(0xF0 <= u && u <= 0xF4) { + /* 4-byte sequence */ + return 4; + } + else { /* u >= 0xF5 */ + /* Restricted (start of 4-, 5- or 6-byte sequence) or invalid + UTF-8 */ + return 0; + } +} + +int utf8_check_full(const char *buffer, int size, int32_t *codepoint) +{ + int i; + int32_t value = 0; + unsigned char u = (unsigned char)buffer[0]; + + if(size == 2) + { + value = u & 0x1F; + } + else if(size == 3) + { + value = u & 0xF; + } + else if(size == 4) + { + value = u & 0x7; + } + else + return 0; + + for(i = 1; i < size; i++) + { + u = (unsigned char)buffer[i]; + + if(u < 0x80 || u > 0xBF) { + /* not a continuation byte */ + return 0; + } + + value = (value << 6) + (u & 0x3F); + } + + if(value > 0x10FFFF) { + /* not in Unicode range */ + return 0; + } + + else if(0xD800 <= value && value <= 0xDFFF) { + /* invalid code point (UTF-16 surrogate halves) */ + return 0; + } + + else if((size == 2 && value < 0x80) || + (size == 3 && value < 0x800) || + (size == 4 && value < 0x10000)) { + /* overlong encoding */ + return 0; + } + + if(codepoint) + *codepoint = value; + + return 1; +} + +const char *utf8_iterate(const char *buffer, int32_t *codepoint) +{ + int count; + int32_t value; + + if(!*buffer) + return buffer; + + count = utf8_check_first(buffer[0]); + if(count <= 0) + return NULL; + + if(count == 1) + value = (unsigned char)buffer[0]; + else + { + if(!utf8_check_full(buffer, count, &value)) + return NULL; + } + + if(codepoint) + *codepoint = value; + + return buffer + count; +} + +int utf8_check_string(const char *string, int length) +{ + int i; + + if(length == -1) + length = strlen(string); + + for(i = 0; i < length; i++) + { + int count = utf8_check_first(string[i]); + if(count == 0) + return 0; + else if(count > 1) + { + if(i + count > length) + return 0; + + if(!utf8_check_full(&string[i], count, NULL)) + return 0; + + i += count - 1; + } + } + + return 1; +} diff --git a/compat/jansson/utf.h b/compat/jansson/utf.h new file mode 100644 index 0000000..8e95296 --- /dev/null +++ b/compat/jansson/utf.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2009, 2010 Petri Lehtinen + * + * Jansson is free software; you can redistribute it and/or modify + * it under the terms of the MIT license. See LICENSE for details. + */ + +#ifndef UTF_H +#define UTF_H + +#include + +#ifdef HAVE_INTTYPES_H +/* inttypes.h includes stdint.h in a standard environment, so there's +no need to include stdint.h separately. If inttypes.h doesn't define +int32_t, it's defined in config.h. */ +#include +#endif + +int utf8_encode(int codepoint, char *buffer, int *size); + +int utf8_check_first(char byte); +int utf8_check_full(const char *buffer, int size, int32_t *codepoint); +const char *utf8_iterate(const char *buffer, int32_t *codepoint); + +int utf8_check_string(const char *string, int length); + +#endif diff --git a/compat/jansson/util.h b/compat/jansson/util.h new file mode 100644 index 0000000..33e5d62 --- /dev/null +++ b/compat/jansson/util.h @@ -0,0 +1,15 @@ +/* + * Copyright (c) 2009, 2010 Petri Lehtinen + * + * Jansson is free software; you can redistribute it and/or modify + * it under the terms of the MIT license. See LICENSE for details. + */ + +#ifndef UTIL_H +#define UTIL_H + +#ifndef max +#define max(a, b) ((a) > (b) ? 
(a) : (b)) +#endif + +#endif diff --git a/compat/jansson/value.c b/compat/jansson/value.c new file mode 100644 index 0000000..591b89e --- /dev/null +++ b/compat/jansson/value.c @@ -0,0 +1,976 @@ +/* + * Copyright (c) 2009, 2010 Petri Lehtinen + * + * Jansson is free software; you can redistribute it and/or modify + * it under the terms of the MIT license. See LICENSE for details. + */ + +#define _GNU_SOURCE + +#include + +#include +#include + +#include +#include "hashtable.h" +#include "jansson_private.h" +#include "utf.h" +#include "util.h" + + +static inline void json_init(json_t *json, json_type type) +{ + json->type = type; + json->refcount = 1; +} + + +/*** object ***/ + +/* This macro just returns a pointer that's a few bytes backwards from + string. This makes it possible to pass a pointer to object_key_t + when only the string inside it is used, without actually creating + an object_key_t instance. */ +#define string_to_key(string) container_of(string, object_key_t, key) + +static unsigned int hash_key(const void *ptr) +{ + const char *str = ((const object_key_t *)ptr)->key; + + unsigned int hash = 5381; + unsigned int c; + + while((c = (unsigned int)*str)) + { + hash = ((hash << 5) + hash) + c; + str++; + } + + return hash; +} + +static int key_equal(const void *ptr1, const void *ptr2) +{ + return strcmp(((const object_key_t *)ptr1)->key, + ((const object_key_t *)ptr2)->key) == 0; +} + +static void value_decref(void *value) +{ + json_decref((json_t *)value); +} + +json_t *json_object(void) +{ + json_object_t *object = malloc(sizeof(json_object_t)); + if(!object) + return NULL; + json_init(&object->json, JSON_OBJECT); + + if(hashtable_init(&object->hashtable, hash_key, key_equal, + free, value_decref)) + { + free(object); + return NULL; + } + + object->serial = 0; + object->visited = 0; + + return &object->json; +} + +static void json_delete_object(json_object_t *object) +{ + hashtable_close(&object->hashtable); + free(object); +} + +unsigned int json_object_size(const json_t *json) +{ + json_object_t *object; + + if(!json_is_object(json)) + return -1; + + object = json_to_object(json); + return object->hashtable.size; +} + +json_t *json_object_get(const json_t *json, const char *key) +{ + json_object_t *object; + + if(!json_is_object(json)) + return NULL; + + object = json_to_object(json); + return hashtable_get(&object->hashtable, string_to_key(key)); +} + +int json_object_set_new_nocheck(json_t *json, const char *key, json_t *value) +{ + json_object_t *object; + object_key_t *k; + + if(!key || !value) + return -1; + + if(!json_is_object(json) || json == value) + { + json_decref(value); + return -1; + } + object = json_to_object(json); + + k = malloc(sizeof(object_key_t) + strlen(key) + 1); + if(!k) + return -1; + + k->serial = object->serial++; + strcpy(k->key, key); + + if(hashtable_set(&object->hashtable, k, value)) + { + json_decref(value); + return -1; + } + + return 0; +} + +int json_object_set_new(json_t *json, const char *key, json_t *value) +{ + if(!key || !utf8_check_string(key, -1)) + { + json_decref(value); + return -1; + } + + return json_object_set_new_nocheck(json, key, value); +} + +int json_object_del(json_t *json, const char *key) +{ + json_object_t *object; + + if(!json_is_object(json)) + return -1; + + object = json_to_object(json); + return hashtable_del(&object->hashtable, string_to_key(key)); +} + +int json_object_clear(json_t *json) +{ + json_object_t *object; + + if(!json_is_object(json)) + return -1; + + object = json_to_object(json); + 
hashtable_clear(&object->hashtable); + + return 0; +} + +int json_object_update(json_t *object, json_t *other) +{ + void *iter; + + if(!json_is_object(object) || !json_is_object(other)) + return -1; + + iter = json_object_iter(other); + while(iter) { + const char *key; + json_t *value; + + key = json_object_iter_key(iter); + value = json_object_iter_value(iter); + + if(json_object_set_nocheck(object, key, value)) + return -1; + + iter = json_object_iter_next(other, iter); + } + + return 0; +} + +void *json_object_iter(json_t *json) +{ + json_object_t *object; + + if(!json_is_object(json)) + return NULL; + + object = json_to_object(json); + return hashtable_iter(&object->hashtable); +} + +void *json_object_iter_at(json_t *json, const char *key) +{ + json_object_t *object; + + if(!key || !json_is_object(json)) + return NULL; + + object = json_to_object(json); + return hashtable_iter_at(&object->hashtable, string_to_key(key)); +} + +void *json_object_iter_next(json_t *json, void *iter) +{ + json_object_t *object; + + if(!json_is_object(json) || iter == NULL) + return NULL; + + object = json_to_object(json); + return hashtable_iter_next(&object->hashtable, iter); +} + +const object_key_t *jsonp_object_iter_fullkey(void *iter) +{ + if(!iter) + return NULL; + + return hashtable_iter_key(iter); +} + +const char *json_object_iter_key(void *iter) +{ + if(!iter) + return NULL; + + return jsonp_object_iter_fullkey(iter)->key; +} + +json_t *json_object_iter_value(void *iter) +{ + if(!iter) + return NULL; + + return (json_t *)hashtable_iter_value(iter); +} + +int json_object_iter_set_new(json_t *json, void *iter, json_t *value) +{ + json_object_t *object; + + if(!json_is_object(json) || !iter || !value) + return -1; + + object = json_to_object(json); + hashtable_iter_set(&object->hashtable, iter, value); + + return 0; +} + +static int json_object_equal(json_t *object1, json_t *object2) +{ + void *iter; + + if(json_object_size(object1) != json_object_size(object2)) + return 0; + + iter = json_object_iter(object1); + while(iter) + { + const char *key; + json_t *value1, *value2; + + key = json_object_iter_key(iter); + value1 = json_object_iter_value(iter); + value2 = json_object_get(object2, key); + + if(!json_equal(value1, value2)) + return 0; + + iter = json_object_iter_next(object1, iter); + } + + return 1; +} + +static json_t *json_object_copy(json_t *object) +{ + json_t *result; + void *iter; + + result = json_object(); + if(!result) + return NULL; + + iter = json_object_iter(object); + while(iter) + { + const char *key; + json_t *value; + + key = json_object_iter_key(iter); + value = json_object_iter_value(iter); + json_object_set_nocheck(result, key, value); + + iter = json_object_iter_next(object, iter); + } + + return result; +} + +static json_t *json_object_deep_copy(json_t *object) +{ + json_t *result; + void *iter; + + result = json_object(); + if(!result) + return NULL; + + iter = json_object_iter(object); + while(iter) + { + const char *key; + json_t *value; + + key = json_object_iter_key(iter); + value = json_object_iter_value(iter); + json_object_set_new_nocheck(result, key, json_deep_copy(value)); + + iter = json_object_iter_next(object, iter); + } + + return result; +} + + +/*** array ***/ + +json_t *json_array(void) +{ + json_array_t *array = malloc(sizeof(json_array_t)); + if(!array) + return NULL; + json_init(&array->json, JSON_ARRAY); + + array->entries = 0; + array->size = 8; + + array->table = malloc(array->size * sizeof(json_t *)); + if(!array->table) { + free(array); + return 
NULL; + } + + array->visited = 0; + + return &array->json; +} + +static void json_delete_array(json_array_t *array) +{ + unsigned int i; + + for(i = 0; i < array->entries; i++) + json_decref(array->table[i]); + + free(array->table); + free(array); +} + +unsigned int json_array_size(const json_t *json) +{ + if(!json_is_array(json)) + return 0; + + return json_to_array(json)->entries; +} + +json_t *json_array_get(const json_t *json, unsigned int index) +{ + json_array_t *array; + if(!json_is_array(json)) + return NULL; + array = json_to_array(json); + + if(index >= array->entries) + return NULL; + + return array->table[index]; +} + +int json_array_set_new(json_t *json, unsigned int index, json_t *value) +{ + json_array_t *array; + + if(!value) + return -1; + + if(!json_is_array(json) || json == value) + { + json_decref(value); + return -1; + } + array = json_to_array(json); + + if(index >= array->entries) + { + json_decref(value); + return -1; + } + + json_decref(array->table[index]); + array->table[index] = value; + + return 0; +} + +static void array_move(json_array_t *array, unsigned int dest, + unsigned int src, unsigned int count) +{ + memmove(&array->table[dest], &array->table[src], count * sizeof(json_t *)); +} + +static void array_copy(json_t **dest, unsigned int dpos, + json_t **src, unsigned int spos, + unsigned int count) +{ + memcpy(&dest[dpos], &src[spos], count * sizeof(json_t *)); +} + +static json_t **json_array_grow(json_array_t *array, + unsigned int amount, + int copy) +{ + unsigned int new_size; + json_t **old_table, **new_table; + + if(array->entries + amount <= array->size) + return array->table; + + old_table = array->table; + + new_size = max(array->size + amount, array->size * 2); + new_table = malloc(new_size * sizeof(json_t *)); + if(!new_table) + return NULL; + + array->size = new_size; + array->table = new_table; + + if(copy) { + array_copy(array->table, 0, old_table, 0, array->entries); + free(old_table); + return array->table; + } + + return old_table; +} + +int json_array_append_new(json_t *json, json_t *value) +{ + json_array_t *array; + + if(!value) + return -1; + + if(!json_is_array(json) || json == value) + { + json_decref(value); + return -1; + } + array = json_to_array(json); + + if(!json_array_grow(array, 1, 1)) { + json_decref(value); + return -1; + } + + array->table[array->entries] = value; + array->entries++; + + return 0; +} + +int json_array_insert_new(json_t *json, unsigned int index, json_t *value) +{ + json_array_t *array; + json_t **old_table; + + if(!value) + return -1; + + if(!json_is_array(json) || json == value) { + json_decref(value); + return -1; + } + array = json_to_array(json); + + if(index > array->entries) { + json_decref(value); + return -1; + } + + old_table = json_array_grow(array, 1, 0); + if(!old_table) { + json_decref(value); + return -1; + } + + if(old_table != array->table) { + array_copy(array->table, 0, old_table, 0, index); + array_copy(array->table, index + 1, old_table, index, + array->entries - index); + free(old_table); + } + else + array_move(array, index + 1, index, array->entries - index); + + array->table[index] = value; + array->entries++; + + return 0; +} + +int json_array_remove(json_t *json, unsigned int index) +{ + json_array_t *array; + + if(!json_is_array(json)) + return -1; + array = json_to_array(json); + + if(index >= array->entries) + return -1; + + json_decref(array->table[index]); + + array_move(array, index, index + 1, array->entries - index); + array->entries--; + + return 0; +} + +int 
json_array_clear(json_t *json) +{ + json_array_t *array; + unsigned int i; + + if(!json_is_array(json)) + return -1; + array = json_to_array(json); + + for(i = 0; i < array->entries; i++) + json_decref(array->table[i]); + + array->entries = 0; + return 0; +} + +int json_array_extend(json_t *json, json_t *other_json) +{ + json_array_t *array, *other; + unsigned int i; + + if(!json_is_array(json) || !json_is_array(other_json)) + return -1; + array = json_to_array(json); + other = json_to_array(other_json); + + if(!json_array_grow(array, other->entries, 1)) + return -1; + + for(i = 0; i < other->entries; i++) + json_incref(other->table[i]); + + array_copy(array->table, array->entries, other->table, 0, other->entries); + + array->entries += other->entries; + return 0; +} + +static int json_array_equal(json_t *array1, json_t *array2) +{ + unsigned int i, size; + + size = json_array_size(array1); + if(size != json_array_size(array2)) + return 0; + + for(i = 0; i < size; i++) + { + json_t *value1, *value2; + + value1 = json_array_get(array1, i); + value2 = json_array_get(array2, i); + + if(!json_equal(value1, value2)) + return 0; + } + + return 1; +} + +static json_t *json_array_copy(json_t *array) +{ + json_t *result; + unsigned int i; + + result = json_array(); + if(!result) + return NULL; + + for(i = 0; i < json_array_size(array); i++) + json_array_append(result, json_array_get(array, i)); + + return result; +} + +static json_t *json_array_deep_copy(json_t *array) +{ + json_t *result; + unsigned int i; + + result = json_array(); + if(!result) + return NULL; + + for(i = 0; i < json_array_size(array); i++) + json_array_append_new(result, json_deep_copy(json_array_get(array, i))); + + return result; +} + +/*** string ***/ + +json_t *json_string_nocheck(const char *value) +{ + json_string_t *string; + + if(!value) + return NULL; + + string = malloc(sizeof(json_string_t)); + if(!string) + return NULL; + json_init(&string->json, JSON_STRING); + + string->value = strdup(value); + if(!string->value) { + free(string); + return NULL; + } + + return &string->json; +} + +json_t *json_string(const char *value) +{ + if(!value || !utf8_check_string(value, -1)) + return NULL; + + return json_string_nocheck(value); +} + +const char *json_string_value(const json_t *json) +{ + if(!json_is_string(json)) + return NULL; + + return json_to_string(json)->value; +} + +int json_string_set_nocheck(json_t *json, const char *value) +{ + char *dup; + json_string_t *string; + + dup = strdup(value); + if(!dup) + return -1; + + string = json_to_string(json); + free(string->value); + string->value = dup; + + return 0; +} + +int json_string_set(json_t *json, const char *value) +{ + if(!value || !utf8_check_string(value, -1)) + return -1; + + return json_string_set_nocheck(json, value); +} + +static void json_delete_string(json_string_t *string) +{ + free(string->value); + free(string); +} + +static int json_string_equal(json_t *string1, json_t *string2) +{ + return strcmp(json_string_value(string1), json_string_value(string2)) == 0; +} + +static json_t *json_string_copy(json_t *string) +{ + return json_string_nocheck(json_string_value(string)); +} + + +/*** integer ***/ + +json_t *json_integer(int value) +{ + json_integer_t *integer = malloc(sizeof(json_integer_t)); + if(!integer) + return NULL; + json_init(&integer->json, JSON_INTEGER); + + integer->value = value; + return &integer->json; +} + +int json_integer_value(const json_t *json) +{ + if(!json_is_integer(json)) + return 0; + + return json_to_integer(json)->value; +} + 
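/* Hypothetical usage sketch, not part of jansson: the reference-counted value
   API above is typically driven with code along these lines, using only
   functions defined in this file or declared in jansson.h ("retries" and
   "scrypt" are made-up example data):

       json_t *obj = json_object();
       json_object_set_new(obj, "retries", json_integer(3));
       json_object_set_new(obj, "algo", json_string("scrypt"));

       int retries = json_integer_value(json_object_get(obj, "retries"));

   retries is now 3. json_object_set_new steals the reference to the value it
   is given, so a single json_decref(obj) afterwards releases the object and
   both members through the hashtable's free functions. */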
+int json_integer_set(json_t *json, int value) +{ + if(!json_is_integer(json)) + return -1; + + json_to_integer(json)->value = value; + + return 0; +} + +static void json_delete_integer(json_integer_t *integer) +{ + free(integer); +} + +static int json_integer_equal(json_t *integer1, json_t *integer2) +{ + return json_integer_value(integer1) == json_integer_value(integer2); +} + +static json_t *json_integer_copy(json_t *integer) +{ + return json_integer(json_integer_value(integer)); +} + + +/*** real ***/ + +json_t *json_real(double value) +{ + json_real_t *real = malloc(sizeof(json_real_t)); + if(!real) + return NULL; + json_init(&real->json, JSON_REAL); + + real->value = value; + return &real->json; +} + +double json_real_value(const json_t *json) +{ + if(!json_is_real(json)) + return 0; + + return json_to_real(json)->value; +} + +int json_real_set(json_t *json, double value) +{ + if(!json_is_real(json)) + return 0; + + json_to_real(json)->value = value; + + return 0; +} + +static void json_delete_real(json_real_t *real) +{ + free(real); +} + +static int json_real_equal(json_t *real1, json_t *real2) +{ + return json_real_value(real1) == json_real_value(real2); +} + +static json_t *json_real_copy(json_t *real) +{ + return json_real(json_real_value(real)); +} + + +/*** number ***/ + +double json_number_value(const json_t *json) +{ + if(json_is_integer(json)) + return json_integer_value(json); + else if(json_is_real(json)) + return json_real_value(json); + else + return 0.0; +} + + +/*** simple values ***/ + +json_t *json_true(void) +{ + static json_t the_true = { + JSON_TRUE, + (unsigned int)-1 + }; + return &the_true; +} + + +json_t *json_false(void) +{ + static json_t the_false = { + JSON_FALSE, + (unsigned int)-1 + }; + return &the_false; +} + + +json_t *json_null(void) +{ + static json_t the_null = { + JSON_NULL, + (unsigned int)-1 + }; + return &the_null; +} + + +/*** deletion ***/ + +void json_delete(json_t *json) +{ + if(json_is_object(json)) + json_delete_object(json_to_object(json)); + + else if(json_is_array(json)) + json_delete_array(json_to_array(json)); + + else if(json_is_string(json)) + json_delete_string(json_to_string(json)); + + else if(json_is_integer(json)) + json_delete_integer(json_to_integer(json)); + + else if(json_is_real(json)) + json_delete_real(json_to_real(json)); + + /* json_delete is not called for true, false or null */ +} + + +/*** equality ***/ + +int json_equal(json_t *json1, json_t *json2) +{ + if(!json1 || !json2) + return 0; + + if(json_typeof(json1) != json_typeof(json2)) + return 0; + + /* this covers true, false and null as they are singletons */ + if(json1 == json2) + return 1; + + if(json_is_object(json1)) + return json_object_equal(json1, json2); + + if(json_is_array(json1)) + return json_array_equal(json1, json2); + + if(json_is_string(json1)) + return json_string_equal(json1, json2); + + if(json_is_integer(json1)) + return json_integer_equal(json1, json2); + + if(json_is_real(json1)) + return json_real_equal(json1, json2); + + return 0; +} + + +/*** copying ***/ + +json_t *json_copy(json_t *json) +{ + if(!json) + return NULL; + + if(json_is_object(json)) + return json_object_copy(json); + + if(json_is_array(json)) + return json_array_copy(json); + + if(json_is_string(json)) + return json_string_copy(json); + + if(json_is_integer(json)) + return json_integer_copy(json); + + if(json_is_real(json)) + return json_real_copy(json); + + if(json_is_true(json) || json_is_false(json) || json_is_null(json)) + return json; + + return NULL; +} + +json_t 
*json_deep_copy(json_t *json) +{ + if(!json) + return NULL; + + if(json_is_object(json)) + return json_object_deep_copy(json); + + if(json_is_array(json)) + return json_array_deep_copy(json); + + /* for the rest of the types, deep copying doesn't differ from + shallow copying */ + + if(json_is_string(json)) + return json_string_copy(json); + + if(json_is_integer(json)) + return json_integer_copy(json); + + if(json_is_real(json)) + return json_real_copy(json); + + if(json_is_true(json) || json_is_false(json) || json_is_null(json)) + return json; + + return NULL; +} diff --git a/compat/stdbool.h b/compat/stdbool.h new file mode 100644 index 0000000..31d0456 --- /dev/null +++ b/compat/stdbool.h @@ -0,0 +1,6 @@ +#pragma once + +#define false 0 +#define true 1 + +#define bool int diff --git a/compat/sys/time.h b/compat/sys/time.h new file mode 100644 index 0000000..0326e1d --- /dev/null +++ b/compat/sys/time.h @@ -0,0 +1,11 @@ +#pragma once +#ifdef __cplusplus +extern "C" +{ +#endif +int gettimeofday(struct timeval *tv, struct timezone *tz); +void usleep(__int64 usec); +#ifdef __cplusplus +} +#endif +typedef __int64 useconds_t; diff --git a/compat/thrust/CHANGELOG b/compat/thrust/CHANGELOG new file mode 100644 index 0000000..110c668 --- /dev/null +++ b/compat/thrust/CHANGELOG @@ -0,0 +1,662 @@ +####################################### +# Thrust v1.7.0 # +####################################### + +Summary + Thrust 1.7.0 introduces a new interface for controlling algorithm execution as + well as several new algorithms and performance improvements. With this new + interface, users may directly control how algorithms execute as well as details + such as the allocation of temporary storage. Key/value versions of thrust::merge + and the set operation algorithms have been added, as well stencil versions of + partitioning algorithms. thrust::tabulate has been introduced to tabulate the + values of functions taking integers. For 32b types, new CUDA merge and set + operations provide 2-15x faster performance while a new CUDA comparison sort + provides 1.3-4x faster performance. Finally, a new TBB reduce_by_key implementation + provides 80% faster performance. + +Breaking API Changes + Dispatch + Custom user backend systems' tag types must now inherit from the corresponding system's execution_policy template (e.g. thrust::cuda::execution_policy) instead + of the tag struct (e.g. thrust::cuda::tag). Otherwise, algorithm specializations will silently go unfound during dispatch. + See examples/minimal_custom_backend.cu and examples/cuda/fallback_allocator.cu for usage examples. + + thrust::advance and thrust::distance are no longer dispatched based on iterator system type and thus may no longer be customized. + + Iterators + iterator_facade and iterator_adaptor's Pointer template parameters have been eliminated. + iterator_adaptor has been moved into the thrust namespace (previously thrust::experimental::iterator_adaptor). + iterator_facade has been moved into the thrust namespace (previously thrust::experimental::iterator_facade). + iterator_core_access has been moved into the thrust namespace (previously thrust::experimental::iterator_core_access). + All iterators' nested pointer typedef (the type of the result of operator->) is now void instead of a pointer type to indicate that such expressions are currently impossible. + Floating point counting_iterators' nested difference_type typedef is now a signed integral type instead of a floating point type. 
+ + Other + normal_distribution has been moved into the thrust::random namespace (previously thrust::random::experimental::normal_distribution). + Placeholder expressions may no longer include the comma operator. + +New Features + Execution Policies + Users may directly control the dispatch of algorithm invocations with optional execution policy arguments. + For example, instead of wrapping raw pointers allocated by cudaMalloc with thrust::device_ptr, the thrust::device execution_policy may be passed as an argument to an algorithm invocation to enable CUDA execution. + The following execution policies are supported in this version: + + thrust::host + thrust::device + thrust::cpp::par + thrust::cuda::par + thrust::omp::par + thrust::tbb::par + + Algorithms + free + get_temporary_buffer + malloc + merge_by_key + partition with stencil + partition_copy with stencil + return_temporary_buffer + set_difference_by_key + set_intersection_by_key + set_symmetric_difference_by_key + set_union_by_key + stable_partition with stencil + stable_partition_copy with stencil + tabulate + +New Examples + uninitialized_vector demonstrates how to use a custom allocator to avoid the automatic initialization of elements in thrust::device_vector. + +Other Enhancements + Authors of custom backend systems may manipulate arbitrary state during algorithm dispatch by incorporating it into their execution_policy parameter. + Users may control the allocation of temporary storage during algorithm execution by passing standard allocators as parameters via execution policies such as thrust::device. + THRUST_DEVICE_SYSTEM_CPP has been added as a compile-time target for the device backend. + CUDA merge performance is 2-15x faster. + CUDA comparison sort performance is 1.3-4x faster. + CUDA set operation performance is 1.5-15x faster. + TBB reduce_by_key performance is 80% faster. + Several algorithms have been parallelized with TBB. + Support for user allocators in vectors has been improved. + The sparse_vector example is now implemented with merge_by_key instead of sort_by_key. + Warnings have been eliminated in various contexts. + Warnings about __host__ or __device__-only functions called from __host__ __device__ functions have been eliminated in various contexts. + Documentation about algorithm requirements have been improved. + Simplified the minimal_custom_backend example. + Simplified the cuda/custom_temporary_allocation example. + Simplified the cuda/fallback_allocator example. + +Bug Fixes + #248 fix broken counting_iterator behavior with OpenMP + #231, #209 fix set operation failures with CUDA + #187 fix incorrect occupancy calculation with CUDA + #153 fix broken multigpu behavior with CUDA + #142 eliminate warning produced by thrust::random::taus88 and MSVC 2010 + #208 correctly initialize elements in temporary storage when necessary + #16 fix compilation error when sorting bool with CUDA + #10 fix ambiguous overloads of reinterpret_tag + +Known Issues + g++ versions 4.3 and lower may fail to dispatch thrust::get_temporary_buffer correctly causing infinite recursion in examples such as cuda/custom_temporary_allocation. + +Acknowledgments + Thanks to Sean Baxter, Bryan Catanzaro, and Manjunath Kudlur for contributing a faster merge implementation for CUDA. + Thanks to Sean Baxter for contributing a faster set operation implementation for CUDA. + Thanks to Cliff Woolley for contributing a correct occupancy calculation algorithm. 
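The execution-policy interface summarized above can be pictured with a short host-code sketch (not taken from the Thrust sources; the buffer size and the fill step are placeholders):

    #include <thrust/execution_policy.h>
    #include <thrust/sort.h>
    #include <cuda_runtime.h>

    int main(void)
    {
        const int n = 1000;
        int *raw = 0;
        cudaMalloc((void **)&raw, n * sizeof(int));
        // ... fill raw on the device ...

        // Thrust 1.7: pass thrust::device directly instead of wrapping
        // the raw pointer in thrust::device_ptr
        thrust::sort(thrust::device, raw, raw + n);

        cudaFree(raw);
        return 0;
    }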
+ +####################################### +# Thrust v1.6.0 # +#######################################
+ +Summary + Thrust v1.6.0 provides an interface for customization and extension and a new + backend system based on the Threading Building Blocks library. With this + new interface, programmers may customize the behavior of specific algorithms + as well as control the allocation of temporary storage or invent entirely new + backends. These enhancements also allow multiple different backend systems + such as CUDA and OpenMP to coexist within a single program. Support for TBB + allows Thrust programs to integrate more naturally into applications which + may already employ the TBB task scheduler.
+ +Breaking API Changes + The header <thrust/experimental/cuda/pinned_allocator.h> has been moved to <thrust/system/cuda/experimental/pinned_allocator.h> + thrust::experimental::cuda::pinned_allocator has been moved to thrust::cuda::experimental::pinned_allocator + The macro THRUST_DEVICE_BACKEND has been renamed THRUST_DEVICE_SYSTEM + The macro THRUST_DEVICE_BACKEND_CUDA has been renamed THRUST_DEVICE_SYSTEM_CUDA + The macro THRUST_DEVICE_BACKEND_OMP has been renamed THRUST_DEVICE_SYSTEM_OMP + thrust::host_space_tag has been renamed thrust::host_system_tag + thrust::device_space_tag has been renamed thrust::device_system_tag + thrust::any_space_tag has been renamed thrust::any_system_tag + thrust::iterator_space has been renamed thrust::iterator_system
+ + +New Features + Backend Systems + Threading Building Blocks (TBB) is now supported + Functions + for_each_n + raw_reference_cast + Types + pointer + reference
+ +New Examples + cuda/custom_temporary_allocation + cuda/fallback_allocator + device_ptr + expand + minimal_custom_backend + raw_reference_cast + set_operations
+ +Other Enhancements + thrust::for_each now returns the end of the input range similar to most other algorithms + thrust::pair and thrust::tuple have swap functionality + all CUDA algorithms now support large data types + iterators may be dereferenced in user __device__ or __global__ functions + the safe use of different backend systems is now possible within a single binary
+ +Bug Fixes + #469 min_element and max_element algorithms no longer require a const comparison operator
+ +Known Issues + cudafe++.exe may crash when parsing TBB headers on Windows.
+ +####################################### +# Thrust v1.5.3 # +####################################### + +Summary + Small bug fixes + +Bug Fixes + Avoid warnings about potential race due to __shared__ non-POD variable
+ +####################################### +# Thrust v1.5.2 # +####################################### + +Summary + Small bug fixes + +Bug Fixes + Fixed warning about C-style initialization of structures
+ +####################################### +# Thrust v1.5.1 # +####################################### + +Summary + Small bug fixes + +Bug Fixes + Sorting data referenced by permutation_iterators on CUDA produces invalid results
+ +####################################### +# Thrust v1.5.0 # +####################################### + +Summary + Thrust v1.5.0 introduces new programmer productivity and performance + enhancements. New functionality for creating anonymous "lambda" functions has + been added. A faster host sort provides 2-10x faster performance for sorting + arithmetic types on (single-threaded) CPUs. A new OpenMP sort provides + 2.5x-3.0x speedup over the host sort using a quad-core CPU.
When sorting + arithmetic types with the OpenMP backend the combined performance improvement + is 5.9x for 32-bit integers and ranges from 3.0x (64-bit types) to 14.2x + (8-bit types). A new CUDA reduce_by_key implementation provides 2-3x faster + performance. + +Breaking API Changes + device_ptr no longer unsafely converts to device_ptr without an + explicit cast. Use the expression + device_pointer_cast(static_cast(void_ptr.get())) + to convert, for example, device_ptr to device_ptr. + +New Features + Functions + stencil-less transform_if + + Types + lambda placeholders + +New Examples + lambda + +Other Enhancements + host sort is 2-10x faster for arithmetic types + OMP sort provides speedup over host sort + reduce_by_key is 2-3x faster + reduce_by_key no longer requires O(N) temporary storage + CUDA scan algorithms are 10-40% faster + host_vector and device_vector are now documented + out-of-memory exceptions now provide detailed information from CUDART + improved histogram example + device_reference now has a specialized swap + reduce_by_key and scan algorithms are compatible with discard_iterator + +Removed Functionality + +Bug Fixes + #44 allow host_vector to compile when value_type uses __align__ + #198 allow adjacent_difference to permit safe in-situ operation + #303 make thrust thread-safe + #313 avoid race conditions in device_vector::insert + #314 avoid unintended adl invocation when dispatching copy + #365 fix merge and set operation failures + +Known Issues + None + +Acknowledgments + Thanks to Manjunath Kudlur for contributing his Carbon library, from which the lambda functionality is derived. + Thanks to Jean-Francois Bastien for suggesting a fix for issue 303. + +####################################### +# Thrust v1.4.0 # +####################################### + +Summary + Thrust v1.4.0 provides support for CUDA 4.0 in addition to many feature + and performance improvements. New set theoretic algorithms operating on + sorted sequences have been added. Additionally, a new fancy iterator + allows discarding redundant or otherwise unnecessary output from + algorithms, conserving memory storage and bandwidth. + +Breaking API Changes + Eliminations + thrust/is_sorted.h + thrust/utility.h + thrust/set_intersection.h + thrust/experimental/cuda/ogl_interop_allocator.h and the functionality therein + thrust::deprecated::copy_when + thrust::deprecated::absolute_value + +New Features + Functions + copy_n + merge + set_difference + set_symmetric_difference + set_union + + Types + discard_iterator + + Device support + Compute Capability 2.1 GPUs + +New Examples + run_length_decoding + +Other Enhancements + Compilation warnings are substantially reduced in various contexts. + The compilation time of thrust::sort, thrust::stable_sort, thrust::sort_by_key, + and thrust::stable_sort_by_key are substantially reduced. + A fast sort implementation is used when sorting primitive types with thrust::greater. + The performance of thrust::set_intersection is improved. + The performance of thrust::fill is improved on SM 1.x devices. + A code example is now provided in each algorithm's documentation. + thrust::reverse now operates in-place + +Removed Functionality + thrust::deprecated::copy_when + thrust::deprecated::absolute_value + thrust::experimental::cuda::ogl_interop_allocator + thrust::gather and thrust::scatter from host to device and vice versa are no longer supported. 
+ Operations which modify the elements of a thrust::device_vector are no longer + available from source code compiled without nvcc when the device backend is CUDA. + Instead, use the idiom from the cpp_interop example. + +Bug Fixes + #212 set_intersection works correctly for large input sizes. + #275 counting_iterator and constant_iterator work correctly with OpenMP as the + backend when compiling with optimization + #256 min and max correctly return their first argument as a tie-breaker + #248 NDEBUG is interpreted correctly + +Known Issues + nvcc may generate code containing warnings when compiling some Thrust algorithms. + When compiling with -arch=sm_1x, some Thrust algorithms may cause nvcc to issue + benign pointer advisories. + When compiling with -arch=sm_1x and -G, some Thrust algorithms may fail to execute correctly. + thrust::inclusive_scan, thrust::exclusive_scan, thrust::inclusive_scan_by_key, + and thrust::exclusive_scan_by_key are currently incompatible with thrust::discard_iterator. + +Acknowledgments + Thanks to David Tarjan for improving the performance of set_intersection. + Thanks to Duane Merrill for continued help with sort. + Thanks to Nathan Whitehead for help with CUDA Toolkit integration. + +####################################### +# Thrust v1.3.0 # +####################################### + +Summary + Thrust v1.3.0 provides support for CUDA 3.2 in addition to many feature + and performance enhancements. + + Performance of the sort and sort_by_key algorithms is improved by as much + as 3x in certain situations. The performance of stream compaction algorithms, + such as copy_if, is improved by as much as 2x. Reduction performance is + also improved, particularly for small input sizes. + + CUDA errors are now converted to runtime exceptions using the system_error + interface. Combined with a debug mode, also new in v1.3, runtime errors + can be located with greater precision. + + Lastly, a few header files have been consolidated or renamed for clarity. + See the deprecations section below for additional details. 
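The system_error reporting mentioned in the summary above can be sketched as follows (illustrative only, not code from the changelog; the vector size is arbitrary):

    #include <thrust/device_vector.h>
    #include <thrust/sort.h>
    #include <thrust/system_error.h>
    #include <iostream>

    int main(void)
    {
        try {
            thrust::device_vector<int> v(1 << 20);
            thrust::sort(v.begin(), v.end());
        }
        catch (thrust::system_error &e) {
            // runtime CUDA failures are converted to this exception type,
            // which derives from std::runtime_error
            std::cerr << "Thrust/CUDA error: " << e.what() << std::endl;
            return 1;
        }
        return 0;
    }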
+ + +Breaking API Changes + Promotions + thrust::experimental::inclusive_segmented_scan has been renamed thrust::inclusive_scan_by_key and exposes a different interface + thrust::experimental::exclusive_segmented_scan has been renamed thrust::exclusive_scan_by_key and exposes a different interface + thrust::experimental::partition_copy has been renamed thrust::partition_copy and exposes a different interface + thrust::next::gather has been renamed thrust::gather + thrust::next::gather_if has been renamed thrust::gather_if + thrust::unique_copy_by_key has been renamed thrust::unique_by_key_copy + Deprecations + thrust::copy_when has been renamed thrust::deprecated::copy_when + thrust::absolute_value has been renamed thrust::deprecated::absolute_value + The header thrust/set_intersection.h is now deprecated; use thrust/set_operations.h instead + The header thrust/utility.h is now deprecated; use thrust/swap.h instead + The header thrust/swap_ranges.h is now deprecated; use thrust/swap.h instead + Eliminations + thrust::deprecated::gather + thrust::deprecated::gather_if + thrust/experimental/arch.h and the functions therein + thrust/sorting/merge_sort.h + thrust/sorting/radix_sort.h + +New Features + Functions + exclusive_scan_by_key + find + find_if + find_if_not + inclusive_scan_by_key + is_partitioned + is_sorted_until + mismatch + partition_point + reverse + reverse_copy + stable_partition_copy + + Types + system_error and related types + experimental::cuda::ogl_interop_allocator + bit_and, bit_or, and bit_xor + + Device support + gf104-based GPUs + +New Examples + opengl_interop.cu + repeated_range.cu + simple_moving_average.cu + sparse_vector.cu + strided_range.cu + +Other Enhancements + Performance of thrust::sort and thrust::sort_by_key is substantially improved for primitive key types + Performance of thrust::copy_if is substantially improved + Performance of thrust::reduce and related reductions is improved + THRUST_DEBUG mode added + Callers of Thrust functions may detect error conditions by catching thrust::system_error, which derives from std::runtime_error + The number of compiler warnings generated by Thrust has been substantially reduced + Comparison sort now works correctly for input sizes > 32M + min & max usage no longer collides with definitions + Compiling against the OpenMP backend no longer requires nvcc + Performance of device_vector initialized in .cpp files is substantially improved in common cases + Performance of thrust::sort_by_key on the host is substantially improved + +Removed Functionality + nvcc 2.3 is no longer supported + +Bug Fixes + Debug device code now compiles correctly + thrust::uninitialized_copy and thrust::unintialized_fill now dispatch constructors on the device rather than the host + +Known Issues + #212 set_intersection is known to fail for large input sizes + partition_point is known to fail for 64b types with nvcc 3.2 + +Acknowledgments + Thanks to Duane Merrill for contributing a fast CUDA radix sort implementation + Thanks to Erich Elsen for contributing an implementation of find_if + Thanks to Andrew Corrigan for contributing changes which allow the OpenMP backend to compile in the absence of nvcc + Thanks to Andrew Corrigan, Cliff Wooley, David Coeurjolly, Janick Martinez Esturo, John Bowers, Maxim Naumov, Michael Garland, and Ryuta Suzuki for bug reports + Thanks to Cliff Woolley for help with testing + +####################################### +# Thrust v1.2.1 # +####################################### + +Summary + Small fixes for 
compatibility with CUDA 3.1 + +Known Issues + inclusive_scan & exclusive_scan may fail with very large types + the Microsoft compiler may fail to compile code using both sort and binary search algorithms + uninitialized_fill & uninitialized_copy dispatch constructors on the host rather than the device + # 109 some algorithms may exhibit poor performance with the OpenMP backend with large numbers (>= 6) of CPU threads + default_random_engine::discard is not accelerated with nvcc 2.3 + nvcc 3.1 may fail to compile code using types derived from thrust::subtract_with_carry_engine, such as thrust::ranlux24 & thrust::ranlux48. + +####################################### +# Thrust v1.2.0 # +####################################### + +Summary + Thrust v1.2 introduces support for compilation to multicore CPUs + and the Ocelot virtual machine, and several new facilities for + pseudo-random number generation. New algorithms such as set + intersection and segmented reduction have also been added. Lastly, + improvements to the robustness of the CUDA backend ensure + correctness across a broad set of (uncommon) use cases. + +Breaking API Changes + thrust::gather's interface was incorrect and has been removed. + The old interface is deprecated but will be preserved for Thrust + version 1.2 at thrust::deprecated::gather & + thrust::deprecated::gather_if. The new interface is provided at + thrust::next::gather & thrust::next::gather_if. The new interface + will be promoted to thrust:: in Thrust version 1.3. For more details, + please refer to this thread: + http://groups.google.com/group/thrust-users/browse_thread/thread/f5f0583cb97b51fd + + The thrust::sorting namespace has been deprecated in favor of the + top-level sorting functions, such as thrust::sort() and + thrust::sort_by_key(). 
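The deprecation of the thrust::sorting namespace described above amounts to calling the top-level sorting functions directly; a minimal sketch with made-up key/value data:

    #include <thrust/device_vector.h>
    #include <thrust/sort.h>

    int main(void)
    {
        int  keys_raw[4] = {3, 1, 2, 0};
        char vals_raw[4] = {'d', 'b', 'c', 'a'};

        thrust::device_vector<int>  keys(keys_raw, keys_raw + 4);
        thrust::device_vector<char> vals(vals_raw, vals_raw + 4);

        // top-level functions, no thrust::sorting:: prefix
        thrust::sort_by_key(keys.begin(), keys.end(), vals.begin());
        // keys are now {0, 1, 2, 3} and vals are now {'a', 'b', 'c', 'd'}

        return 0;
    }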
+ +New Features + Functions + reduce_by_key + set_intersection + tie + unique_copy + unique_by_key + unique_copy_by_key + + Types + Random Number Generation + discard_block_engine + default_random_engine + linear_congruential_engine + linear_feedback_shift_engine + minstd_rand + minstd_rand0 + normal_distribution (experimental) + ranlux24 + ranlux48 + ranlux24_base + ranlux48_base + subtract_with_carry_engine + taus88 + uniform_int_distribution + uniform_real_distribution + xor_combine_engine + Functionals + project1st + project2nd + + Fancy Iterators + permutation_iterator + reverse_iterator + + Device support + Add support for multicore CPUs via OpenMP + Add support for Fermi-class GPUs + Add support for Ocelot virtual machine + +New Examples + cpp_integration + histogram + mode + monte_carlo + monte_carlo_disjoint_sequences + padded_grid_reduction + permutation_iterator + row_sum + run_length_encoding + segmented_scan + stream_compaction + summary_statistics + transform_iterator + word_count + +Other Enhancements + vector functions operator!=, rbegin, crbegin, rend, crend, data, & shrink_to_fit + integer sorting performance is improved when max is large but (max - min) is small and when min is negative + performance of inclusive_scan() and exclusive_scan() is improved by 20-25% for primitive types + support for nvcc 3.0 + +Removed Functionality + removed support for equal between host & device sequences + removed support for gather() and scatter() between host & device sequences + +Bug Fixes + # 8 cause a compiler error if the required compiler is not found rather than a mysterious error at link time + # 42 device_ptr & device_reference are classes rather than structs, eliminating warnings on certain platforms + # 46 gather & scatter handle any space iterators correctly + # 51 thrust::experimental::arch functions gracefully handle unrecognized GPUs + # 52 avoid collisions with common user macros such as BLOCK_SIZE + # 62 provide better documentation for device_reference + # 68 allow built-in CUDA vector types to work with device_vector in pure C++ mode + # 102 eliminated a race condition in device_vector::erase + various compilation warnings eliminated + +Known Issues + inclusive_scan & exclusive_scan may fail with very large types + the Microsoft compiler may fail to compile code using both sort and binary search algorithms + uninitialized_fill & uninitialized_copy dispatch constructors on the host rather than the device + # 109 some algorithms may exhibit poor performance with the OpenMP backend with large numbers (>= 6) of CPU threads + default_random_engine::discard is not accelerated with nvcc 2.3 + +Acknowledgments + Thanks to Gregory Diamos for contributing a CUDA implementation of set_intersection + Thanks to Ryuta Suzuki & Gregory Diamos for rigorously testing Thrust's unit tests and examples against Ocelot + Thanks to Tom Bradley for contributing an implementation of normal_distribution + Thanks to Joseph Rhoads for contributing the example summary_statistics + +####################################### +# Thrust v1.1.1 # +####################################### + +Summary + Small fixes for compatibility with CUDA 2.3a and Mac OSX Snow Leopard. + +####################################### +# Thrust v1.1.0 # +####################################### + +Summary + Thrust v1.1 introduces fancy iterators, binary search functions, and + several specialized reduction functions. Experimental support for + segmented scan has also been added. 
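The fancy iterators mentioned in the summary can be pictured with a counting_iterator sketch (a hypothetical snippet using present-day header paths, not code from this release):

    #include <thrust/iterator/counting_iterator.h>
    #include <thrust/reduce.h>

    int main(void)
    {
        thrust::counting_iterator<int> first(0);
        thrust::counting_iterator<int> last(100);

        // the sequence 0, 1, ..., 99 is generated on the fly; no storage is allocated
        int sum = thrust::reduce(first, last);

        return sum == 4950 ? 0 : 1;
    }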
+ +Breaking API Changes + counting_iterator has been moved into the thrust namespace (previously thrust::experimental) + +New Features + Functions + copy_if + lower_bound + upper_bound + vectorized lower_bound + vectorized upper_bound + equal_range + binary_search + vectorized binary_search + all_of + any_of + none_of + minmax_element + advance + inclusive_segmented_scan (experimental) + exclusive_segmented_scan (experimental) + + Types + pair + tuple + device_malloc_allocator + + Fancy Iterators + constant_iterator + counting_iterator + transform_iterator + zip_iterator + +New Examples + computing the maximum absolute difference between vectors + computing the bounding box of a two-dimensional point set + sorting multiple arrays together (lexicographical sorting) + constructing a summed area table + using zip_iterator to mimic an array of structs + using constant_iterator to increment array values + +Other Enhancements + added pinned memory allocator (experimental) + added more methods to host_vector & device_vector (issue #4) + added variant of remove_if with a stencil argument (issue #29) + scan and reduce use cudaFuncGetAttributes to determine grid size + exceptions are reported when temporary device arrays cannot be allocated + +Bug Fixes + #5 make vector work for larger data types + #9 stable_partition_copy doesn't respect OutputIterator concept semantics + #10 scans should return OutputIterator + #16 make algorithms work for larger data types + #27 dispatch radix_sort even when comp=less is explicitly provided + +Known Issues + Using functors with Thrust entry points may not compile on Mac OSX with gcc-4.0.1 + uninitialized_copy & uninitialized_fill dispatch constructors on the host rather than the device. + inclusive_scan, inclusive_scan_by_key, exclusive_scan, and exclusive_scan_by_key may fail when used with large types with the CUDA 3.1 driver + + +####################################### +# Thrust v1.0.0 # +####################################### + +Breaking API changes + Rename top level namespace komrade to thrust. + Move partition_copy() & stable_partition_copy() into thrust::experimental namespace until we can easily provide the standard interface. + Rename range() to sequence() to avoid collision with Boost.Range. + Rename copy_if() to copy_when() due to semantic differences with C++0x copy_if(). + +New Features + Add C++0x style cbegin() & cend() methods to host_vector & device_vector. + Add transform_if function. + Add stencil versions of replace_if() & replace_copy_if(). + Allow counting_iterator to work with for_each(). + Allow types with constructors in comparison sort & reduce. + +Other Enhancements + merge_sort and stable_merge_sort are now 2 to 5x faster when executed on the parallel device. + +Bug fixes + Workaround an issue where an incremented iterator causes nvcc to crash. (Komrade issue #6) + Fix an issue where const_iterators could not be passed to transform. (Komrade issue #7) + diff --git a/compat/thrust/adjacent_difference.h b/compat/thrust/adjacent_difference.h new file mode 100644 index 0000000..772b5f9 --- /dev/null +++ b/compat/thrust/adjacent_difference.h @@ -0,0 +1,244 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/*! \file adjacent_difference.h
+ * \brief Compute difference between consecutive elements of a range
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+
+namespace thrust
+{
+
+
+/*! \addtogroup transformations Transformations
+ * \{
+ */
+
+
+/*! \p adjacent_difference calculates the differences of adjacent elements in the
+ * range [first, last). That is, \*first is assigned to
+ * \*result, and, for each iterator \p i in the range
+ * [first + 1, last), the difference of \*i and \*(i - 1)
+ * is assigned to \*(result + (i - first)).
+ *
+ * This version of \p adjacent_difference uses operator- to calculate
+ * differences.
+ *
+ * The algorithm's execution is parallelized as determined by \p exec.
+ *
+ * \param exec The execution policy to use for parallelization.
+ * \param first The beginning of the input range.
+ * \param last The end of the input range.
+ * \param result The beginning of the output range.
+ * \return The iterator result + (last - first)
+ *
+ * \tparam DerivedPolicy The name of the derived execution policy.
+ * \tparam InputIterator is a model of Input Iterator,
+ * and if \c x and \c y are objects of \p InputIterator's \c value_type, then \c x - \c y is defined,
+ * and \p InputIterator's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types,
+ * and the return type of x - y is convertible to a type in \p OutputIterator's set of \c value_types.
+ * \tparam OutputIterator is a model of Output Iterator.
+ *
+ * \remark Note that \p result is permitted to be the same iterator as \p first. This is
+ * useful for computing differences "in place".
+ *
+ * The following code snippet demonstrates how to use \p adjacent_difference to compute
+ * the difference between adjacent elements of a range using the \p thrust::device execution policy:
+ *
+ * \code
+ * #include <thrust/adjacent_difference.h>
+ * #include <thrust/device_vector.h>
+ * #include <thrust/execution_policy.h>
+ * ...
+ * int h_data[8] = {1, 2, 1, 2, 1, 2, 1, 2};
+ * thrust::device_vector<int> d_data(h_data, h_data + 8);
+ * thrust::device_vector<int> d_result(8);
+ *
+ * thrust::adjacent_difference(thrust::device, d_data.begin(), d_data.end(), d_result.begin());
+ *
+ * // d_result is now [1, 1, -1, 1, -1, 1, -1, 1]
+ * \endcode
+ *
+ * \see http://www.sgi.com/tech/stl/adjacent_difference.html
+ * \see inclusive_scan
+ */
+template<typename DerivedPolicy, typename InputIterator, typename OutputIterator>
+OutputIterator adjacent_difference(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                   InputIterator first, InputIterator last,
+                                   OutputIterator result);
+
+/*! \p adjacent_difference calculates the differences of adjacent elements in the
+ * range [first, last). That is, \*first is assigned to
+ * \*result, and, for each iterator \p i in the range
+ * [first + 1, last), binary_op(\*i, \*(i - 1)) is assigned to
+ * \*(result + (i - first)).
+ *
+ * This version of \p adjacent_difference uses the binary function \p binary_op to
+ * calculate differences.
+ *
+ * The algorithm's execution is parallelized as determined by \p exec.
+ *
+ * \param exec The execution policy to use for parallelization.
+ * \param first The beginning of the input range.
+ * \param last The end of the input range.
+ * \param result The beginning of the output range.
+ * \param binary_op The binary function used to compute differences.
+ * \return The iterator result + (last - first)
+ *
+ * \tparam DerivedPolicy The name of the derived execution policy.
+ * \tparam InputIterator is a model of Input Iterator,
+ * and \p InputIterator's \c value_type is convertible to \p BinaryFunction's \c first_argument_type and \c second_argument_type,
+ * and \p InputIterator's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types.
+ * \tparam OutputIterator is a model of Output Iterator.
+ * \tparam BinaryFunction is a model of Binary Function, and \c BinaryFunction's \c result_type is convertible to a type in \p OutputIterator's set of \c value_types.
+ *
+ * \remark Note that \p result is permitted to be the same iterator as \p first. This is
+ * useful for computing differences "in place".
+ *
+ * The following code snippet demonstrates how to use \p adjacent_difference to compute
+ * the sum between adjacent elements of a range using the \p thrust::device execution policy:
+ *
+ * \code
+ * #include <thrust/adjacent_difference.h>
+ * #include <thrust/device_vector.h>
+ * #include <thrust/functional.h>
+ * #include <thrust/execution_policy.h>
+ * ...
+ * int h_data[8] = {1, 2, 1, 2, 1, 2, 1, 2};
+ * thrust::device_vector<int> d_data(h_data, h_data + 8);
+ * thrust::device_vector<int> d_result(8);
+ *
+ * thrust::adjacent_difference(thrust::device, d_data.begin(), d_data.end(), d_result.begin(), thrust::plus<int>());
+ *
+ * // d_result is now [1, 3, 3, 3, 3, 3, 3, 3]
+ * \endcode
+ *
+ * \see http://www.sgi.com/tech/stl/adjacent_difference.html
+ * \see inclusive_scan
+ */
+template<typename DerivedPolicy, typename InputIterator, typename OutputIterator, typename BinaryFunction>
+OutputIterator adjacent_difference(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                   InputIterator first, InputIterator last,
+                                   OutputIterator result,
+                                   BinaryFunction binary_op);
+
+/*! \p adjacent_difference calculates the differences of adjacent elements in the
+ * range [first, last). That is, \*first is assigned to
+ * \*result, and, for each iterator \p i in the range
+ * [first + 1, last), the difference of \*i and \*(i - 1)
+ * is assigned to \*(result + (i - first)).
+ *
+ * This version of \p adjacent_difference uses operator- to calculate
+ * differences.
+ *
+ * \param first The beginning of the input range.
+ * \param last The end of the input range.
+ * \param result The beginning of the output range.
+ * \return The iterator result + (last - first)
+ *
+ * \tparam InputIterator is a model of Input Iterator,
+ * and if \c x and \c y are objects of \p InputIterator's \c value_type, then \c x - \c y is defined,
+ * and \p InputIterator's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types,
+ * and the return type of x - y is convertible to a type in \p OutputIterator's set of \c value_types.
+ * \tparam OutputIterator is a model of Output Iterator.
+ *
+ * \remark Note that \p result is permitted to be the same iterator as \p first. This is
+ * useful for computing differences "in place".
+ *
+ * The following code snippet demonstrates how to use \p adjacent_difference to compute
+ * the difference between adjacent elements of a range.
+ *
+ * \code
+ * #include <thrust/adjacent_difference.h>
+ * #include <thrust/device_vector.h>
+ * ...
+ * int h_data[8] = {1, 2, 1, 2, 1, 2, 1, 2};
+ * thrust::device_vector<int> d_data(h_data, h_data + 8);
+ * thrust::device_vector<int> d_result(8);
+ *
+ * thrust::adjacent_difference(d_data.begin(), d_data.end(), d_result.begin());
+ *
+ * // d_result is now [1, 1, -1, 1, -1, 1, -1, 1]
+ * \endcode
+ *
+ * \see http://www.sgi.com/tech/stl/adjacent_difference.html
+ * \see inclusive_scan
+ */
+template <typename InputIterator, typename OutputIterator>
+OutputIterator adjacent_difference(InputIterator first, InputIterator last,
+                                   OutputIterator result);
+
+/*! \p adjacent_difference calculates the differences of adjacent elements in the
+ * range [first, last). That is, \*first is assigned to
+ * \*result, and, for each iterator \p i in the range
+ * [first + 1, last), binary_op(\*i, \*(i - 1)) is assigned to
+ * \*(result + (i - first)).
+ *
+ * This version of \p adjacent_difference uses the binary function \p binary_op to
+ * calculate differences.
+ *
+ * \param first The beginning of the input range.
+ * \param last The end of the input range.
+ * \param result The beginning of the output range.
+ * \param binary_op The binary function used to compute differences.
+ * \return The iterator result + (last - first)
+ *
+ * \tparam InputIterator is a model of Input Iterator,
+ * and \p InputIterator's \c value_type is convertible to \p BinaryFunction's \c first_argument_type and \c second_argument_type,
+ * and \p InputIterator's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types.
+ * \tparam OutputIterator is a model of Output Iterator.
+ * \tparam BinaryFunction is a model of Binary Function, and \c BinaryFunction's \c result_type is convertible to a type in \p OutputIterator's set of \c value_types.
+ *
+ * \remark Note that \p result is permitted to be the same iterator as \p first. This is
+ * useful for computing differences "in place".
+ *
+ * The following code snippet demonstrates how to use \p adjacent_difference to compute
+ * the sum between adjacent elements of a range.
+ *
+ * \code
+ * #include <thrust/adjacent_difference.h>
+ * #include <thrust/device_vector.h>
+ * #include <thrust/functional.h>
+ * ...
+ * int h_data[8] = {1, 2, 1, 2, 1, 2, 1, 2};
+ * thrust::device_vector<int> d_data(h_data, h_data + 8);
+ * thrust::device_vector<int> d_result(8);
+ *
+ * thrust::adjacent_difference(d_data.begin(), d_data.end(), d_result.begin(), thrust::plus<int>());
+ *
+ * // d_result is now [1, 3, 3, 3, 3, 3, 3, 3]
+ * \endcode
+ *
+ * \see http://www.sgi.com/tech/stl/adjacent_difference.html
+ * \see inclusive_scan
+ */
+template <typename InputIterator, typename OutputIterator, typename BinaryFunction>
+OutputIterator adjacent_difference(InputIterator first, InputIterator last,
+                                   OutputIterator result,
+                                   BinaryFunction binary_op);
+
+/*! \}
+ */
+
+} // end namespace thrust
+
+#include <thrust/detail/adjacent_difference.inl>
+
diff --git a/compat/thrust/advance.h b/compat/thrust/advance.h
new file mode 100644
index 0000000..e7f60b0
--- /dev/null
+++ b/compat/thrust/advance.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright 2008-2012 NVIDIA Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/*! \file advance.h
+ * \brief Advance an iterator by a given distance.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+namespace thrust
+{
+
+
+/*! \addtogroup iterators
+ * \{
+ */
+
+/*! \p advance(i, n) increments the iterator \p i by the distance \p n.
+ * If n > 0 it is equivalent to executing ++i \p n
+ * times, and if n < 0 it is equivalent to executing --i
+ * \p n times. If n == 0, the call has no effect.
+ *
+ * \param i The iterator to be advanced.
+ * \param n The distance by which to advance the iterator.
+ *
+ * \tparam InputIterator is a model of Input Iterator.
+ * \tparam Distance is an integral type that is convertible to \p InputIterator's distance type.
+ *
+ * \pre \p n shall be negative only for bidirectional and random access iterators.
+ *
+ * The following code snippet demonstrates how to use \p advance to increment
+ * an iterator a given number of times.
+ *
+ * \code
+ * #include <thrust/advance.h>
+ * #include <thrust/device_vector.h>
+ * ...
+ * thrust::device_vector<int> vec(13);
+ * thrust::device_vector<int>::iterator iter = vec.begin();
+ *
+ * thrust::advance(iter, 7);
+ *
+ * // iter - vec.begin() == 7
+ * \endcode
+ *
+ * \see http://www.sgi.com/tech/stl/advance.html
+ */
+template <typename InputIterator, typename Distance>
+void advance(InputIterator& i, Distance n);
+
+/*! \} // end iterators
+ */
+
+} // end thrust
+
+#include <thrust/detail/advance.inl>
+
diff --git a/compat/thrust/binary_search.h b/compat/thrust/binary_search.h
new file mode 100644
index 0000000..d2ac5a6
--- /dev/null
+++ b/compat/thrust/binary_search.h
@@ -0,0 +1,1888 @@
+/*
+ * Copyright 2008-2012 NVIDIA Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/*! \file binary_search.h
+ * \brief Search for values in sorted ranges.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+#include <thrust/pair.h>
+
+namespace thrust
+{
+
+
+/*! \addtogroup algorithms
+ */
+
+
+/*! \addtogroup searching
+ * \ingroup algorithms
+ * \{
+ */
+
+
+/*! \addtogroup binary_search Binary Search
+ * \ingroup searching
+ * \{
+ */
+
+
+//////////////////////
+// Scalar Functions //
+//////////////////////
+
+
+/*! \p lower_bound is a version of binary search: it attempts to find
+ * the element value in an ordered range [first, last).
+ * Specifically, it returns the first position where value could be
+ * inserted without violating the ordering. This version of
+ * \p lower_bound uses operator< for comparison and returns
+ * the furthermost iterator \c i in [first, last) such that,
+ * for every iterator \c j in [first, i), *j < value.
+ *
+ * The algorithm's execution is parallelized as determined by \p exec.
+ *
+ * \param exec The execution policy to use for parallelization.
+ * \param first The beginning of the ordered sequence.
+ * \param last The end of the ordered sequence.
+ * \param value The value to be searched.
+ * \return The furthermost iterator \c i, such that *i < value.
+ *
+ * \tparam DerivedPolicy The name of the derived execution policy.
+ * \tparam ForwardIterator is a model of Forward Iterator.
+ * \tparam LessThanComparable is a model of LessThanComparable.
+ * + * The following code snippet demonstrates how to use \p lower_bound + * to search for values in a ordered range using the \p thrust::device execution policy for parallelization: + * + * \code + * #include + * #include + * #include + * ... + * thrust::device_vector input(5); + * + * input[0] = 0; + * input[1] = 2; + * input[2] = 5; + * input[3] = 7; + * input[4] = 8; + * + * thrust::lower_bound(thrust::device, input.begin(), input.end(), 0); // returns input.begin() + * thrust::lower_bound(thrust::device, input.begin(), input.end(), 1); // returns input.begin() + 1 + * thrust::lower_bound(thrust::device, input.begin(), input.end(), 2); // returns input.begin() + 1 + * thrust::lower_bound(thrust::device, input.begin(), input.end(), 3); // returns input.begin() + 2 + * thrust::lower_bound(thrust::device, input.begin(), input.end(), 8); // returns input.begin() + 4 + * thrust::lower_bound(thrust::device, input.begin(), input.end(), 9); // returns input.end() + * \endcode + * + * \see http://www.sgi.com/tech/stl/lower_bound.html + * \see \p upper_bound + * \see \p equal_range + * \see \p binary_search + */ +template +ForwardIterator lower_bound(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + const LessThanComparable &value); + + +/*! \p lower_bound is a version of binary search: it attempts to find + * the element value in an ordered range [first, last). + * Specifically, it returns the first position where value could be + * inserted without violating the ordering. This version of + * \p lower_bound uses operator< for comparison and returns + * the furthermost iterator \c i in [first, last) such that, + * for every iterator \c j in [first, i), *j < value. + * + * \param first The beginning of the ordered sequence. + * \param last The end of the ordered sequence. + * \param value The value to be searched. + * \return The furthermost iterator \c i, such that *i < value. + * + * \tparam ForwardIterator is a model of Forward Iterator. + * \tparam LessThanComparable is a model of LessThanComparable. + * + * The following code snippet demonstrates how to use \p lower_bound + * to search for values in a ordered range. + * + * \code + * #include + * #include + * ... + * thrust::device_vector input(5); + * + * input[0] = 0; + * input[1] = 2; + * input[2] = 5; + * input[3] = 7; + * input[4] = 8; + * + * thrust::lower_bound(input.begin(), input.end(), 0); // returns input.begin() + * thrust::lower_bound(input.begin(), input.end(), 1); // returns input.begin() + 1 + * thrust::lower_bound(input.begin(), input.end(), 2); // returns input.begin() + 1 + * thrust::lower_bound(input.begin(), input.end(), 3); // returns input.begin() + 2 + * thrust::lower_bound(input.begin(), input.end(), 8); // returns input.begin() + 4 + * thrust::lower_bound(input.begin(), input.end(), 9); // returns input.end() + * \endcode + * + * \see http://www.sgi.com/tech/stl/lower_bound.html + * \see \p upper_bound + * \see \p equal_range + * \see \p binary_search + */ +template +ForwardIterator lower_bound(ForwardIterator first, + ForwardIterator last, + const LessThanComparable& value); + + +/*! \p lower_bound is a version of binary search: it attempts to find + * the element value in an ordered range [first, last). + * Specifically, it returns the first position where value could be + * inserted without violating the ordering. 
This version of + * \p lower_bound uses function object \c comp for comparison + * and returns the furthermost iterator \c i in [first, last) + * such that, for every iterator \c j in [first, i), + * comp(*j, value) is \c true. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the ordered sequence. + * \param last The end of the ordered sequence. + * \param value The value to be searched. + * \param comp The comparison operator. + * \return The furthermost iterator \c i, such that comp(*i, value) is \c true. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam ForwardIterator is a model of Forward Iterator. + * \tparam T is comparable to \p ForwardIterator's \c value_type. + * \tparam StrictWeakOrdering is a model of Strict Weak Ordering. + * + * The following code snippet demonstrates how to use \p lower_bound + * to search for values in a ordered range using the \p thrust::device execution policy for parallelization: + * + * \code + * #include + * #include + * #include + * #include + * ... + * thrust::device_vector input(5); + * + * input[0] = 0; + * input[1] = 2; + * input[2] = 5; + * input[3] = 7; + * input[4] = 8; + * + * thrust::lower_bound(input.begin(), input.end(), 0, thrust::less()); // returns input.begin() + * thrust::lower_bound(input.begin(), input.end(), 1, thrust::less()); // returns input.begin() + 1 + * thrust::lower_bound(input.begin(), input.end(), 2, thrust::less()); // returns input.begin() + 1 + * thrust::lower_bound(input.begin(), input.end(), 3, thrust::less()); // returns input.begin() + 2 + * thrust::lower_bound(input.begin(), input.end(), 8, thrust::less()); // returns input.begin() + 4 + * thrust::lower_bound(input.begin(), input.end(), 9, thrust::less()); // returns input.end() + * \endcode + * + * \see http://www.sgi.com/tech/stl/lower_bound.html + * \see \p upper_bound + * \see \p equal_range + * \see \p binary_search + */ +template +ForwardIterator lower_bound(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + const T &value, + StrictWeakOrdering comp); + + +/*! \p lower_bound is a version of binary search: it attempts to find + * the element value in an ordered range [first, last). + * Specifically, it returns the first position where value could be + * inserted without violating the ordering. This version of + * \p lower_bound uses function object \c comp for comparison + * and returns the furthermost iterator \c i in [first, last) + * such that, for every iterator \c j in [first, i), + * comp(*j, value) is \c true. + * + * \param first The beginning of the ordered sequence. + * \param last The end of the ordered sequence. + * \param value The value to be searched. + * \param comp The comparison operator. + * \return The furthermost iterator \c i, such that comp(*i, value) is \c true. + * + * \tparam ForwardIterator is a model of Forward Iterator. + * \tparam T is comparable to \p ForwardIterator's \c value_type. + * \tparam StrictWeakOrdering is a model of Strict Weak Ordering. + * + * The following code snippet demonstrates how to use \p lower_bound + * to search for values in a ordered range. + * + * \code + * #include + * #include + * #include + * ... 
+ * thrust::device_vector input(5); + * + * input[0] = 0; + * input[1] = 2; + * input[2] = 5; + * input[3] = 7; + * input[4] = 8; + * + * thrust::lower_bound(input.begin(), input.end(), 0, thrust::less()); // returns input.begin() + * thrust::lower_bound(input.begin(), input.end(), 1, thrust::less()); // returns input.begin() + 1 + * thrust::lower_bound(input.begin(), input.end(), 2, thrust::less()); // returns input.begin() + 1 + * thrust::lower_bound(input.begin(), input.end(), 3, thrust::less()); // returns input.begin() + 2 + * thrust::lower_bound(input.begin(), input.end(), 8, thrust::less()); // returns input.begin() + 4 + * thrust::lower_bound(input.begin(), input.end(), 9, thrust::less()); // returns input.end() + * \endcode + * + * \see http://www.sgi.com/tech/stl/lower_bound.html + * \see \p upper_bound + * \see \p equal_range + * \see \p binary_search + */ +template +ForwardIterator lower_bound(ForwardIterator first, + ForwardIterator last, + const T& value, + StrictWeakOrdering comp); + + +/*! \p upper_bound is a version of binary search: it attempts to find + * the element value in an ordered range [first, last). + * Specifically, it returns the last position where value could be + * inserted without violating the ordering. This version of + * \p upper_bound uses operator< for comparison and returns + * the furthermost iterator \c i in [first, last) such that, + * for every iterator \c j in [first, i), value < *j + * is \c false. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the ordered sequence. + * \param last The end of the ordered sequence. + * \param value The value to be searched. + * \return The furthermost iterator \c i, such that value < *i is \c false. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam ForwardIterator is a model of Forward Iterator. + * \tparam LessThanComparable is a model of LessThanComparable. + * + * The following code snippet demonstrates how to use \p upper_bound + * to search for values in a ordered range using the \p thrust::device execution policy for parallelism: + * + * \code + * #include + * #include + * #include + * ... + * thrust::device_vector input(5); + * + * input[0] = 0; + * input[1] = 2; + * input[2] = 5; + * input[3] = 7; + * input[4] = 8; + * + * thrust::upper_bound(thrust::device, input.begin(), input.end(), 0); // returns input.begin() + 1 + * thrust::upper_bound(thrust::device, input.begin(), input.end(), 1); // returns input.begin() + 1 + * thrust::upper_bound(thrust::device, input.begin(), input.end(), 2); // returns input.begin() + 2 + * thrust::upper_bound(thrust::device, input.begin(), input.end(), 3); // returns input.begin() + 2 + * thrust::upper_bound(thrust::device, input.begin(), input.end(), 8); // returns input.end() + * thrust::upper_bound(thrust::device, input.begin(), input.end(), 9); // returns input.end() + * \endcode + * + * \see http://www.sgi.com/tech/stl/upper_bound.html + * \see \p lower_bound + * \see \p equal_range + * \see \p binary_search + */ +template +ForwardIterator upper_bound(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + const LessThanComparable &value); + + +/*! \p upper_bound is a version of binary search: it attempts to find + * the element value in an ordered range [first, last). 
+ * Specifically, it returns the last position where value could be + * inserted without violating the ordering. This version of + * \p upper_bound uses operator< for comparison and returns + * the furthermost iterator \c i in [first, last) such that, + * for every iterator \c j in [first, i), value < *j + * is \c false. + * + * \param first The beginning of the ordered sequence. + * \param last The end of the ordered sequence. + * \param value The value to be searched. + * \return The furthermost iterator \c i, such that value < *i is \c false. + * + * \tparam ForwardIterator is a model of Forward Iterator. + * \tparam LessThanComparable is a model of LessThanComparable. + * + * The following code snippet demonstrates how to use \p upper_bound + * to search for values in a ordered range. + * + * \code + * #include + * #include + * ... + * thrust::device_vector input(5); + * + * input[0] = 0; + * input[1] = 2; + * input[2] = 5; + * input[3] = 7; + * input[4] = 8; + * + * thrust::upper_bound(input.begin(), input.end(), 0); // returns input.begin() + 1 + * thrust::upper_bound(input.begin(), input.end(), 1); // returns input.begin() + 1 + * thrust::upper_bound(input.begin(), input.end(), 2); // returns input.begin() + 2 + * thrust::upper_bound(input.begin(), input.end(), 3); // returns input.begin() + 2 + * thrust::upper_bound(input.begin(), input.end(), 8); // returns input.end() + * thrust::upper_bound(input.begin(), input.end(), 9); // returns input.end() + * \endcode + * + * \see http://www.sgi.com/tech/stl/upper_bound.html + * \see \p lower_bound + * \see \p equal_range + * \see \p binary_search + */ +template +ForwardIterator upper_bound(ForwardIterator first, + ForwardIterator last, + const LessThanComparable& value); + + +/*! \p upper_bound is a version of binary search: it attempts to find + * the element value in an ordered range [first, last). + * Specifically, it returns the last position where value could be + * inserted without violating the ordering. This version of + * \p upper_bound uses function object \c comp for comparison and returns + * the furthermost iterator \c i in [first, last) such that, + * for every iterator \c j in [first, i), comp(value, *j) + * is \c false. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the ordered sequence. + * \param last The end of the ordered sequence. + * \param value The value to be searched. + * \param comp The comparison operator. + * \return The furthermost iterator \c i, such that comp(value, *i) is \c false. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam ForwardIterator is a model of Forward Iterator. + * \tparam T is comparable to \p ForwardIterator's \c value_type. + * \tparam StrictWeakOrdering is a model of Strict Weak Ordering. + * + * The following code snippet demonstrates how to use \p upper_bound + * to search for values in a ordered range using the \p thrust::device execution policy for parallelization: + * + * \code + * #include + * #include + * #include + * #include + * ... 
+ * thrust::device_vector input(5); + * + * input[0] = 0; + * input[1] = 2; + * input[2] = 5; + * input[3] = 7; + * input[4] = 8; + * + * thrust::upper_bound(thrust::device, input.begin(), input.end(), 0, thrust::less()); // returns input.begin() + 1 + * thrust::upper_bound(thrust::device, input.begin(), input.end(), 1, thrust::less()); // returns input.begin() + 1 + * thrust::upper_bound(thrust::device, input.begin(), input.end(), 2, thrust::less()); // returns input.begin() + 2 + * thrust::upper_bound(thrust::device, input.begin(), input.end(), 3, thrust::less()); // returns input.begin() + 2 + * thrust::upper_bound(thrust::device, input.begin(), input.end(), 8, thrust::less()); // returns input.end() + * thrust::upper_bound(thrust::device, input.begin(), input.end(), 9, thrust::less()); // returns input.end() + * \endcode + * + * \see http://www.sgi.com/tech/stl/upper_bound.html + * \see \p lower_bound + * \see \p equal_range + * \see \p binary_search + */ +template +ForwardIterator upper_bound(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + const T &value, + StrictWeakOrdering comp); + +/*! \p upper_bound is a version of binary search: it attempts to find + * the element value in an ordered range [first, last). + * Specifically, it returns the last position where value could be + * inserted without violating the ordering. This version of + * \p upper_bound uses function object \c comp for comparison and returns + * the furthermost iterator \c i in [first, last) such that, + * for every iterator \c j in [first, i), comp(value, *j) + * is \c false. + * + * \param first The beginning of the ordered sequence. + * \param last The end of the ordered sequence. + * \param value The value to be searched. + * \param comp The comparison operator. + * \return The furthermost iterator \c i, such that comp(value, *i) is \c false. + * + * \tparam ForwardIterator is a model of Forward Iterator. + * \tparam T is comparable to \p ForwardIterator's \c value_type. + * \tparam StrictWeakOrdering is a model of Strict Weak Ordering. + * + * The following code snippet demonstrates how to use \p upper_bound + * to search for values in a ordered range. + * + * \code + * #include + * #include + * #include + * ... + * thrust::device_vector input(5); + * + * input[0] = 0; + * input[1] = 2; + * input[2] = 5; + * input[3] = 7; + * input[4] = 8; + * + * thrust::upper_bound(input.begin(), input.end(), 0, thrust::less()); // returns input.begin() + 1 + * thrust::upper_bound(input.begin(), input.end(), 1, thrust::less()); // returns input.begin() + 1 + * thrust::upper_bound(input.begin(), input.end(), 2, thrust::less()); // returns input.begin() + 2 + * thrust::upper_bound(input.begin(), input.end(), 3, thrust::less()); // returns input.begin() + 2 + * thrust::upper_bound(input.begin(), input.end(), 8, thrust::less()); // returns input.end() + * thrust::upper_bound(input.begin(), input.end(), 9, thrust::less()); // returns input.end() + * \endcode + * + * \see http://www.sgi.com/tech/stl/upper_bound.html + * \see \p lower_bound + * \see \p equal_range + * \see \p binary_search + */ +template +ForwardIterator upper_bound(ForwardIterator first, + ForwardIterator last, + const T& value, + StrictWeakOrdering comp); + + +/*! \p binary_search is a version of binary search: it attempts to find + * the element value in an ordered range [first, last). 
+ * It returns \c true if an element that is equivalent to \c value + * is present in [first, last) and \c false if no such element + * exists. Specifically, this version returns \c true if and only if + * there exists an iterator \c i in [first, last) such that + * *i < value and value < *i are both \c false. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the ordered sequence. + * \param last The end of the ordered sequence. + * \param value The value to be searched. + * \return \c true if an equivalent element exists in [first, last), otherwise \c false. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam ForwardIterator is a model of Forward Iterator. + * \tparam LessThanComparable is a model of LessThanComparable. + * + * The following code snippet demonstrates how to use \p binary_search + * to search for values in a ordered range using the \p thrust::device execution policy for parallelization: + * + * \code + * #include + * #include + * #include + * ... + * thrust::device_vector input(5); + * + * input[0] = 0; + * input[1] = 2; + * input[2] = 5; + * input[3] = 7; + * input[4] = 8; + * + * thrust::binary_search(thrust::device, input.begin(), input.end(), 0); // returns true + * thrust::binary_search(thrust::device, input.begin(), input.end(), 1); // returns false + * thrust::binary_search(thrust::device, input.begin(), input.end(), 2); // returns true + * thrust::binary_search(thrust::device, input.begin(), input.end(), 3); // returns false + * thrust::binary_search(thrust::device, input.begin(), input.end(), 8); // returns true + * thrust::binary_search(thrust::device, input.begin(), input.end(), 9); // returns false + * \endcode + * + * \see http://www.sgi.com/tech/stl/binary_search.html + * \see \p lower_bound + * \see \p upper_bound + * \see \p equal_range + */ +template +bool binary_search(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + const LessThanComparable& value); + + +/*! \p binary_search is a version of binary search: it attempts to find + * the element value in an ordered range [first, last). + * It returns \c true if an element that is equivalent to \c value + * is present in [first, last) and \c false if no such element + * exists. Specifically, this version returns \c true if and only if + * there exists an iterator \c i in [first, last) such that + * *i < value and value < *i are both \c false. + * + * \param first The beginning of the ordered sequence. + * \param last The end of the ordered sequence. + * \param value The value to be searched. + * \return \c true if an equivalent element exists in [first, last), otherwise \c false. + * + * \tparam ForwardIterator is a model of Forward Iterator. + * \tparam LessThanComparable is a model of LessThanComparable. + * + * The following code snippet demonstrates how to use \p binary_search + * to search for values in a ordered range. + * + * \code + * #include + * #include + * ... 
+ * thrust::device_vector input(5); + * + * input[0] = 0; + * input[1] = 2; + * input[2] = 5; + * input[3] = 7; + * input[4] = 8; + * + * thrust::binary_search(input.begin(), input.end(), 0); // returns true + * thrust::binary_search(input.begin(), input.end(), 1); // returns false + * thrust::binary_search(input.begin(), input.end(), 2); // returns true + * thrust::binary_search(input.begin(), input.end(), 3); // returns false + * thrust::binary_search(input.begin(), input.end(), 8); // returns true + * thrust::binary_search(input.begin(), input.end(), 9); // returns false + * \endcode + * + * \see http://www.sgi.com/tech/stl/binary_search.html + * \see \p lower_bound + * \see \p upper_bound + * \see \p equal_range + */ +template +bool binary_search(ForwardIterator first, + ForwardIterator last, + const LessThanComparable& value); + + +/*! \p binary_search is a version of binary search: it attempts to find + * the element value in an ordered range [first, last). + * It returns \c true if an element that is equivalent to \c value + * is present in [first, last) and \c false if no such element + * exists. Specifically, this version returns \c true if and only if + * there exists an iterator \c i in [first, last) such that + * comp(*i, value) and comp(value, *i) are both \c false. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the ordered sequence. + * \param last The end of the ordered sequence. + * \param value The value to be searched. + * \param comp The comparison operator. + * \return \c true if an equivalent element exists in [first, last), otherwise \c false. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam ForwardIterator is a model of Forward Iterator. + * \tparam T is comparable to \p ForwardIterator's \c value_type. + * \tparam StrictWeakOrdering is a model of Strict Weak Ordering. + * + * The following code snippet demonstrates how to use \p binary_search + * to search for values in a ordered range using the \p thrust::device execution policy for parallelization: + * + * \code + * #include + * #include + * #include + * #include + * ... + * thrust::device_vector input(5); + * + * input[0] = 0; + * input[1] = 2; + * input[2] = 5; + * input[3] = 7; + * input[4] = 8; + * + * thrust::binary_search(thrust::device, input.begin(), input.end(), 0, thrust::less()); // returns true + * thrust::binary_search(thrust::device, input.begin(), input.end(), 1, thrust::less()); // returns false + * thrust::binary_search(thrust::device, input.begin(), input.end(), 2, thrust::less()); // returns true + * thrust::binary_search(thrust::device, input.begin(), input.end(), 3, thrust::less()); // returns false + * thrust::binary_search(thrust::device, input.begin(), input.end(), 8, thrust::less()); // returns true + * thrust::binary_search(thrust::device, input.begin(), input.end(), 9, thrust::less()); // returns false + * \endcode + * + * \see http://www.sgi.com/tech/stl/binary_search.html + * \see \p lower_bound + * \see \p upper_bound + * \see \p equal_range + */ +template +bool binary_search(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + const T& value, + StrictWeakOrdering comp); + + +/*! \p binary_search is a version of binary search: it attempts to find + * the element value in an ordered range [first, last). 
+ * It returns \c true if an element that is equivalent to \c value + * is present in [first, last) and \c false if no such element + * exists. Specifically, this version returns \c true if and only if + * there exists an iterator \c i in [first, last) such that + * comp(*i, value) and comp(value, *i) are both \c false. + * + * \param first The beginning of the ordered sequence. + * \param last The end of the ordered sequence. + * \param value The value to be searched. + * \param comp The comparison operator. + * \return \c true if an equivalent element exists in [first, last), otherwise \c false. + * + * \tparam ForwardIterator is a model of Forward Iterator. + * \tparam T is comparable to \p ForwardIterator's \c value_type. + * \tparam StrictWeakOrdering is a model of Strict Weak Ordering. + * + * The following code snippet demonstrates how to use \p binary_search + * to search for values in a ordered range. + * + * \code + * #include + * #include + * #include + * ... + * thrust::device_vector input(5); + * + * input[0] = 0; + * input[1] = 2; + * input[2] = 5; + * input[3] = 7; + * input[4] = 8; + * + * thrust::binary_search(input.begin(), input.end(), 0, thrust::less()); // returns true + * thrust::binary_search(input.begin(), input.end(), 1, thrust::less()); // returns false + * thrust::binary_search(input.begin(), input.end(), 2, thrust::less()); // returns true + * thrust::binary_search(input.begin(), input.end(), 3, thrust::less()); // returns false + * thrust::binary_search(input.begin(), input.end(), 8, thrust::less()); // returns true + * thrust::binary_search(input.begin(), input.end(), 9, thrust::less()); // returns false + * \endcode + * + * \see http://www.sgi.com/tech/stl/binary_search.html + * \see \p lower_bound + * \see \p upper_bound + * \see \p equal_range + */ +template +bool binary_search(ForwardIterator first, + ForwardIterator last, + const T& value, + StrictWeakOrdering comp); + + +/*! \p equal_range is a version of binary search: it attempts to find + * the element value in an ordered range [first, last). The + * value returned by \p equal_range is essentially a combination of + * the values returned by \p lower_bound and \p upper_bound: it returns + * a \p pair of iterators \c i and \c j such that \c i is the first + * position where value could be inserted without violating the + * ordering and \c j is the last position where value could be inserted + * without violating the ordering. It follows that every element in the + * range [i, j) is equivalent to value, and that + * [i, j) is the largest subrange of [first, last) that + * has this property. + * + * This version of \p equal_range returns a \p pair of iterators + * [i, j), where \c i is the furthermost iterator in + * [first, last) such that, for every iterator \c k in + * [first, i), *k < value. \c j is the furthermost + * iterator in [first, last) such that, for every iterator + * \c k in [first, j), value < *k is \c false. + * For every iterator \c k in [i, j), neither + * value < *k nor *k < value is \c true. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the ordered sequence. + * \param last The end of the ordered sequence. + * \param value The value to be searched. + * \return A \p pair of iterators [i, j) that define the range of equivalent elements. + * + * \tparam DerivedPolicy The name of the derived execution policy. 
+ * \tparam ForwardIterator is a model of Forward Iterator. + * \tparam LessThanComparable is a model of LessThanComparable. + * + * The following code snippet demonstrates how to use \p equal_range + * to search for values in a ordered range using the \p thrust::device execution policy for parallelization: + * + * \code + * #include + * #include + * #include + * ... + * thrust::device_vector input(5); + * + * input[0] = 0; + * input[1] = 2; + * input[2] = 5; + * input[3] = 7; + * input[4] = 8; + * + * thrust::equal_range(thrust::device, input.begin(), input.end(), 0); // returns [input.begin(), input.begin() + 1) + * thrust::equal_range(thrust::device, input.begin(), input.end(), 1); // returns [input.begin() + 1, input.begin() + 1) + * thrust::equal_range(thrust::device, input.begin(), input.end(), 2); // returns [input.begin() + 1, input.begin() + 2) + * thrust::equal_range(thrust::device, input.begin(), input.end(), 3); // returns [input.begin() + 2, input.begin() + 2) + * thrust::equal_range(thrust::device, input.begin(), input.end(), 8); // returns [input.begin() + 4, input.end) + * thrust::equal_range(thrust::device, input.begin(), input.end(), 9); // returns [input.end(), input.end) + * \endcode + * + * \see http://www.sgi.com/tech/stl/equal_range.html + * \see \p lower_bound + * \see \p upper_bound + * \see \p binary_search + */ +template +thrust::pair +equal_range(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + const LessThanComparable& value); + + +/*! \p equal_range is a version of binary search: it attempts to find + * the element value in an ordered range [first, last). The + * value returned by \p equal_range is essentially a combination of + * the values returned by \p lower_bound and \p upper_bound: it returns + * a \p pair of iterators \c i and \c j such that \c i is the first + * position where value could be inserted without violating the + * ordering and \c j is the last position where value could be inserted + * without violating the ordering. It follows that every element in the + * range [i, j) is equivalent to value, and that + * [i, j) is the largest subrange of [first, last) that + * has this property. + * + * This version of \p equal_range returns a \p pair of iterators + * [i, j), where \c i is the furthermost iterator in + * [first, last) such that, for every iterator \c k in + * [first, i), *k < value. \c j is the furthermost + * iterator in [first, last) such that, for every iterator + * \c k in [first, j), value < *k is \c false. + * For every iterator \c k in [i, j), neither + * value < *k nor *k < value is \c true. + * + * \param first The beginning of the ordered sequence. + * \param last The end of the ordered sequence. + * \param value The value to be searched. + * \return A \p pair of iterators [i, j) that define the range of equivalent elements. + * + * \tparam ForwardIterator is a model of Forward Iterator. + * \tparam LessThanComparable is a model of LessThanComparable. + * + * The following code snippet demonstrates how to use \p equal_range + * to search for values in a ordered range. + * + * \code + * #include + * #include + * ... 
+ * thrust::device_vector input(5); + * + * input[0] = 0; + * input[1] = 2; + * input[2] = 5; + * input[3] = 7; + * input[4] = 8; + * + * thrust::equal_range(input.begin(), input.end(), 0); // returns [input.begin(), input.begin() + 1) + * thrust::equal_range(input.begin(), input.end(), 1); // returns [input.begin() + 1, input.begin() + 1) + * thrust::equal_range(input.begin(), input.end(), 2); // returns [input.begin() + 1, input.begin() + 2) + * thrust::equal_range(input.begin(), input.end(), 3); // returns [input.begin() + 2, input.begin() + 2) + * thrust::equal_range(input.begin(), input.end(), 8); // returns [input.begin() + 4, input.end) + * thrust::equal_range(input.begin(), input.end(), 9); // returns [input.end(), input.end) + * \endcode + * + * \see http://www.sgi.com/tech/stl/equal_range.html + * \see \p lower_bound + * \see \p upper_bound + * \see \p binary_search + */ +template +thrust::pair +equal_range(ForwardIterator first, + ForwardIterator last, + const LessThanComparable& value); + + +/*! \p equal_range is a version of binary search: it attempts to find + * the element value in an ordered range [first, last). The + * value returned by \p equal_range is essentially a combination of + * the values returned by \p lower_bound and \p upper_bound: it returns + * a \p pair of iterators \c i and \c j such that \c i is the first + * position where value could be inserted without violating the + * ordering and \c j is the last position where value could be inserted + * without violating the ordering. It follows that every element in the + * range [i, j) is equivalent to value, and that + * [i, j) is the largest subrange of [first, last) that + * has this property. + * + * This version of \p equal_range returns a \p pair of iterators + * [i, j). \c i is the furthermost iterator in + * [first, last) such that, for every iterator \c k in + * [first, i), comp(*k, value) is \c true. + * \c j is the furthermost iterator in [first, last) such + * that, for every iterator \c k in [first, last), + * comp(value, *k) is \c false. For every iterator \c k + * in [i, j), neither comp(value, *k) nor + * comp(*k, value) is \c true. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the ordered sequence. + * \param last The end of the ordered sequence. + * \param value The value to be searched. + * \param comp The comparison operator. + * \return A \p pair of iterators [i, j) that define the range of equivalent elements. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam ForwardIterator is a model of Forward Iterator. + * \tparam T is comparable to \p ForwardIterator's \c value_type. + * \tparam StrictWeakOrdering is a model of Strict Weak Ordering. + * + * The following code snippet demonstrates how to use \p equal_range + * to search for values in a ordered range using the \p thrust::device execution policy for parallelization: + * + * \code + * #include + * #include + * #include + * #include + * ... 
+ * thrust::device_vector input(5); + * + * input[0] = 0; + * input[1] = 2; + * input[2] = 5; + * input[3] = 7; + * input[4] = 8; + * + * thrust::equal_range(thrust::device, input.begin(), input.end(), 0, thrust::less()); // returns [input.begin(), input.begin() + 1) + * thrust::equal_range(thrust::device, input.begin(), input.end(), 1, thrust::less()); // returns [input.begin() + 1, input.begin() + 1) + * thrust::equal_range(thrust::device, input.begin(), input.end(), 2, thrust::less()); // returns [input.begin() + 1, input.begin() + 2) + * thrust::equal_range(thrust::device, input.begin(), input.end(), 3, thrust::less()); // returns [input.begin() + 2, input.begin() + 2) + * thrust::equal_range(thrust::device, input.begin(), input.end(), 8, thrust::less()); // returns [input.begin() + 4, input.end) + * thrust::equal_range(thrust::device, input.begin(), input.end(), 9, thrust::less()); // returns [input.end(), input.end) + * \endcode + * + * \see http://www.sgi.com/tech/stl/equal_range.html + * \see \p lower_bound + * \see \p upper_bound + * \see \p binary_search + */ +template +thrust::pair +equal_range(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + const T& value, + StrictWeakOrdering comp); + + +/*! \p equal_range is a version of binary search: it attempts to find + * the element value in an ordered range [first, last). The + * value returned by \p equal_range is essentially a combination of + * the values returned by \p lower_bound and \p upper_bound: it returns + * a \p pair of iterators \c i and \c j such that \c i is the first + * position where value could be inserted without violating the + * ordering and \c j is the last position where value could be inserted + * without violating the ordering. It follows that every element in the + * range [i, j) is equivalent to value, and that + * [i, j) is the largest subrange of [first, last) that + * has this property. + * + * This version of \p equal_range returns a \p pair of iterators + * [i, j). \c i is the furthermost iterator in + * [first, last) such that, for every iterator \c k in + * [first, i), comp(*k, value) is \c true. + * \c j is the furthermost iterator in [first, last) such + * that, for every iterator \c k in [first, last), + * comp(value, *k) is \c false. For every iterator \c k + * in [i, j), neither comp(value, *k) nor + * comp(*k, value) is \c true. + * + * \param first The beginning of the ordered sequence. + * \param last The end of the ordered sequence. + * \param value The value to be searched. + * \param comp The comparison operator. + * \return A \p pair of iterators [i, j) that define the range of equivalent elements. + * + * \tparam ForwardIterator is a model of Forward Iterator. + * \tparam T is comparable to \p ForwardIterator's \c value_type. + * \tparam StrictWeakOrdering is a model of Strict Weak Ordering. + * + * The following code snippet demonstrates how to use \p equal_range + * to search for values in a ordered range. + * + * \code + * #include + * #include + * #include + * ... 
+ * thrust::device_vector input(5); + * + * input[0] = 0; + * input[1] = 2; + * input[2] = 5; + * input[3] = 7; + * input[4] = 8; + * + * thrust::equal_range(input.begin(), input.end(), 0, thrust::less()); // returns [input.begin(), input.begin() + 1) + * thrust::equal_range(input.begin(), input.end(), 1, thrust::less()); // returns [input.begin() + 1, input.begin() + 1) + * thrust::equal_range(input.begin(), input.end(), 2, thrust::less()); // returns [input.begin() + 1, input.begin() + 2) + * thrust::equal_range(input.begin(), input.end(), 3, thrust::less()); // returns [input.begin() + 2, input.begin() + 2) + * thrust::equal_range(input.begin(), input.end(), 8, thrust::less()); // returns [input.begin() + 4, input.end) + * thrust::equal_range(input.begin(), input.end(), 9, thrust::less()); // returns [input.end(), input.end) + * \endcode + * + * \see http://www.sgi.com/tech/stl/equal_range.html + * \see \p lower_bound + * \see \p upper_bound + * \see \p binary_search + */ +template +thrust::pair +equal_range(ForwardIterator first, + ForwardIterator last, + const T& value, + StrictWeakOrdering comp); + + +/*! \addtogroup vectorized_binary_search Vectorized Searches + * \ingroup binary_search + * \{ + */ + + +////////////////////// +// Vector Functions // +////////////////////// + + +/*! \p lower_bound is a vectorized version of binary search: for each + * iterator \c v in [values_first, values_last) it attempts to + * find the value *v in an ordered range [first, last). + * Specifically, it returns the index of first position where value could + * be inserted without violating the ordering. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the ordered sequence. + * \param last The end of the ordered sequence. + * \param values_first The beginning of the search values sequence. + * \param values_last The end of the search values sequence. + * \param result The beginning of the output sequence. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam ForwardIterator is a model of Forward Iterator. + * \tparam InputIterator is a model of Input Iterator. + * and \c InputIterator's \c value_type is LessThanComparable. + * \tparam OutputIterator is a model of Output Iterator. + * and \c ForwardIterator's difference_type is convertible to \c OutputIterator's \c value_type. + * + * \pre The ranges [first,last) and [result, result + (last - first)) shall not overlap. + * + * The following code snippet demonstrates how to use \p lower_bound + * to search for multiple values in a ordered range using the \p thrust::device execution policy for + * parallelization: + * + * \code + * #include + * #include + * #include + * ... 
+ * thrust::device_vector input(5); + * + * input[0] = 0; + * input[1] = 2; + * input[2] = 5; + * input[3] = 7; + * input[4] = 8; + * + * thrust::device_vector values(6); + * values[0] = 0; + * values[1] = 1; + * values[2] = 2; + * values[3] = 3; + * values[4] = 8; + * values[5] = 9; + * + * thrust::device_vector output(6); + * + * thrust::lower_bound(thrust::device, + * input.begin(), input.end(), + * values.begin(), values.end(), + * output.begin()); + * + * // output is now [0, 1, 1, 2, 4, 5] + * \endcode + * + * \see http://www.sgi.com/tech/stl/lower_bound.html + * \see \p upper_bound + * \see \p equal_range + * \see \p binary_search + */ +template +OutputIterator lower_bound(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + InputIterator values_first, + InputIterator values_last, + OutputIterator result); + + +/*! \p lower_bound is a vectorized version of binary search: for each + * iterator \c v in [values_first, values_last) it attempts to + * find the value *v in an ordered range [first, last). + * Specifically, it returns the index of first position where value could + * be inserted without violating the ordering. + * + * \param first The beginning of the ordered sequence. + * \param last The end of the ordered sequence. + * \param values_first The beginning of the search values sequence. + * \param values_last The end of the search values sequence. + * \param result The beginning of the output sequence. + * + * \tparam ForwardIterator is a model of Forward Iterator. + * \tparam InputIterator is a model of Input Iterator. + * and \c InputIterator's \c value_type is LessThanComparable. + * \tparam OutputIterator is a model of Output Iterator. + * and \c ForwardIterator's difference_type is convertible to \c OutputIterator's \c value_type. + * + * \pre The ranges [first,last) and [result, result + (last - first)) shall not overlap. + * + * The following code snippet demonstrates how to use \p lower_bound + * to search for multiple values in a ordered range. + * + * \code + * #include + * #include + * ... + * thrust::device_vector input(5); + * + * input[0] = 0; + * input[1] = 2; + * input[2] = 5; + * input[3] = 7; + * input[4] = 8; + * + * thrust::device_vector values(6); + * values[0] = 0; + * values[1] = 1; + * values[2] = 2; + * values[3] = 3; + * values[4] = 8; + * values[5] = 9; + * + * thrust::device_vector output(6); + * + * thrust::lower_bound(input.begin(), input.end(), + * values.begin(), values.end(), + * output.begin()); + * + * // output is now [0, 1, 1, 2, 4, 5] + * \endcode + * + * \see http://www.sgi.com/tech/stl/lower_bound.html + * \see \p upper_bound + * \see \p equal_range + * \see \p binary_search + */ +template +OutputIterator lower_bound(ForwardIterator first, + ForwardIterator last, + InputIterator values_first, + InputIterator values_last, + OutputIterator result); + + +/*! \p lower_bound is a vectorized version of binary search: for each + * iterator \c v in [values_first, values_last) it attempts to + * find the value *v in an ordered range [first, last). + * Specifically, it returns the index of first position where value could + * be inserted without violating the ordering. This version of + * \p lower_bound uses function object \c comp for comparison. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the ordered sequence. + * \param last The end of the ordered sequence. 
+ * \param values_first The beginning of the search values sequence. + * \param values_last The end of the search values sequence. + * \param result The beginning of the output sequence. + * \param comp The comparison operator. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam ForwardIterator is a model of Forward Iterator. + * \tparam InputIterator is a model of Input Iterator. + * and \c InputIterator's \c value_type is comparable to \p ForwardIterator's \c value_type. + * \tparam OutputIterator is a model of Output Iterator. + * and \c ForwardIterator's difference_type is convertible to \c OutputIterator's \c value_type. + * \tparam StrictWeakOrdering is a model of Strict Weak Ordering. + * + * \pre The ranges [first,last) and [result, result + (last - first)) shall not overlap. + * + * The following code snippet demonstrates how to use \p lower_bound + * to search for multiple values in a ordered range. + * + * \code + * #include + * #include + * #include + * #include + * ... + * thrust::device_vector input(5); + * + * input[0] = 0; + * input[1] = 2; + * input[2] = 5; + * input[3] = 7; + * input[4] = 8; + * + * thrust::device_vector values(6); + * values[0] = 0; + * values[1] = 1; + * values[2] = 2; + * values[3] = 3; + * values[4] = 8; + * values[5] = 9; + * + * thrust::device_vector output(6); + * + * thrust::lower_bound(input.begin(), input.end(), + * values.begin(), values.end(), + * output.begin(), + * thrust::less()); + * + * // output is now [0, 1, 1, 2, 4, 5] + * \endcode + * + * \see http://www.sgi.com/tech/stl/lower_bound.html + * \see \p upper_bound + * \see \p equal_range + * \see \p binary_search + */ +template +OutputIterator lower_bound(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + InputIterator values_first, + InputIterator values_last, + OutputIterator result, + StrictWeakOrdering comp); + + +/*! \p lower_bound is a vectorized version of binary search: for each + * iterator \c v in [values_first, values_last) it attempts to + * find the value *v in an ordered range [first, last). + * Specifically, it returns the index of first position where value could + * be inserted without violating the ordering. This version of + * \p lower_bound uses function object \c comp for comparison. + * + * \param first The beginning of the ordered sequence. + * \param last The end of the ordered sequence. + * \param values_first The beginning of the search values sequence. + * \param values_last The end of the search values sequence. + * \param result The beginning of the output sequence. + * \param comp The comparison operator. + * + * \tparam ForwardIterator is a model of Forward Iterator. + * \tparam InputIterator is a model of Input Iterator. + * and \c InputIterator's \c value_type is comparable to \p ForwardIterator's \c value_type. + * \tparam OutputIterator is a model of Output Iterator. + * and \c ForwardIterator's difference_type is convertible to \c OutputIterator's \c value_type. + * \tparam StrictWeakOrdering is a model of Strict Weak Ordering. + * + * \pre The ranges [first,last) and [result, result + (last - first)) shall not overlap. + * + * The following code snippet demonstrates how to use \p lower_bound + * to search for multiple values in a ordered range. + * + * \code + * #include + * #include + * #include + * ... 
+ * thrust::device_vector input(5); + * + * input[0] = 0; + * input[1] = 2; + * input[2] = 5; + * input[3] = 7; + * input[4] = 8; + * + * thrust::device_vector values(6); + * values[0] = 0; + * values[1] = 1; + * values[2] = 2; + * values[3] = 3; + * values[4] = 8; + * values[5] = 9; + * + * thrust::device_vector output(6); + * + * thrust::lower_bound(input.begin(), input.end(), + * values.begin(), values.end(), + * output.begin(), + * thrust::less()); + * + * // output is now [0, 1, 1, 2, 4, 5] + * \endcode + * + * \see http://www.sgi.com/tech/stl/lower_bound.html + * \see \p upper_bound + * \see \p equal_range + * \see \p binary_search + */ +template +OutputIterator lower_bound(ForwardIterator first, + ForwardIterator last, + InputIterator values_first, + InputIterator values_last, + OutputIterator result, + StrictWeakOrdering comp); + + +/*! \p upper_bound is a vectorized version of binary search: for each + * iterator \c v in [values_first, values_last) it attempts to + * find the value *v in an ordered range [first, last). + * Specifically, it returns the index of last position where value could + * be inserted without violating the ordering. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the ordered sequence. + * \param last The end of the ordered sequence. + * \param values_first The beginning of the search values sequence. + * \param values_last The end of the search values sequence. + * \param result The beginning of the output sequence. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam ForwardIterator is a model of Forward Iterator. + * \tparam InputIterator is a model of Input Iterator. + * and \c InputIterator's \c value_type is LessThanComparable. + * \tparam OutputIterator is a model of Output Iterator. + * and \c ForwardIterator's difference_type is convertible to \c OutputIterator's \c value_type. + * + * \pre The ranges [first,last) and [result, result + (last - first)) shall not overlap. + * + * The following code snippet demonstrates how to use \p lower_bound + * to search for multiple values in a ordered range using the \p thrust::device execution policy for + * parallelization: + * + * \code + * #include + * #include + * #include + * ... + * thrust::device_vector input(5); + * + * input[0] = 0; + * input[1] = 2; + * input[2] = 5; + * input[3] = 7; + * input[4] = 8; + * + * thrust::device_vector values(6); + * values[0] = 0; + * values[1] = 1; + * values[2] = 2; + * values[3] = 3; + * values[4] = 8; + * values[5] = 9; + * + * thrust::device_vector output(6); + * + * thrust::upper_bound(thrust::device, + * input.begin(), input.end(), + * values.begin(), values.end(), + * output.begin()); + * + * // output is now [1, 1, 2, 2, 5, 5] + * \endcode + * + * \see http://www.sgi.com/tech/stl/upper_bound.html + * \see \p upper_bound + * \see \p equal_range + * \see \p binary_search + */ +template +OutputIterator upper_bound(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + InputIterator values_first, + InputIterator values_last, + OutputIterator result); + + +/*! \p upper_bound is a vectorized version of binary search: for each + * iterator \c v in [values_first, values_last) it attempts to + * find the value *v in an ordered range [first, last). + * Specifically, it returns the index of last position where value could + * be inserted without violating the ordering. 
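+ *
+ * As a rough sketch of how the two vectorized searches combine (the \c int vectors
+ * \c input, \c values, \c lo and \c hi used here are illustrative assumptions, not
+ * part of this interface), subtracting the \p lower_bound output from the
+ * \p upper_bound output gives the number of occurrences of each search value:
+ *
+ * \code
+ * thrust::device_vector<int> lo(values.size()), hi(values.size());
+ * thrust::lower_bound(input.begin(), input.end(), values.begin(), values.end(), lo.begin());
+ * thrust::upper_bound(input.begin(), input.end(), values.begin(), values.end(), hi.begin());
+ * // hi[i] - lo[i] is the number of elements of input equal to values[i]
+ * \endcode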
+ *
+ * \param first The beginning of the ordered sequence.
+ * \param last The end of the ordered sequence.
+ * \param values_first The beginning of the search values sequence.
+ * \param values_last The end of the search values sequence.
+ * \param result The beginning of the output sequence.
+ *
+ * \tparam ForwardIterator is a model of Forward Iterator.
+ * \tparam InputIterator is a model of Input Iterator,
+ * and \c InputIterator's \c value_type is LessThanComparable.
+ * \tparam OutputIterator is a model of Output Iterator,
+ * and \c ForwardIterator's \c difference_type is convertible to \c OutputIterator's \c value_type.
+ *
+ * \pre The ranges [first,last) and [result, result + (last - first)) shall not overlap.
+ *
+ * The following code snippet demonstrates how to use \p upper_bound
+ * to search for multiple values in an ordered range.
+ *
+ * \code
+ * #include <thrust/binary_search.h>
+ * #include <thrust/device_vector.h>
+ * ...
+ * thrust::device_vector<int> input(5);
+ *
+ * input[0] = 0;
+ * input[1] = 2;
+ * input[2] = 5;
+ * input[3] = 7;
+ * input[4] = 8;
+ *
+ * thrust::device_vector<int> values(6);
+ * values[0] = 0;
+ * values[1] = 1;
+ * values[2] = 2;
+ * values[3] = 3;
+ * values[4] = 8;
+ * values[5] = 9;
+ *
+ * thrust::device_vector<int> output(6);
+ *
+ * thrust::upper_bound(input.begin(), input.end(),
+ *                     values.begin(), values.end(),
+ *                     output.begin());
+ *
+ * // output is now [1, 1, 2, 2, 5, 5]
+ * \endcode
+ *
+ * \see http://www.sgi.com/tech/stl/upper_bound.html
+ * \see \p lower_bound
+ * \see \p equal_range
+ * \see \p binary_search
+ */
+template<typename ForwardIterator, typename InputIterator, typename OutputIterator>
+OutputIterator upper_bound(ForwardIterator first,
+                           ForwardIterator last,
+                           InputIterator values_first,
+                           InputIterator values_last,
+                           OutputIterator result);
+
+
+/*! \p upper_bound is a vectorized version of binary search: for each
+ * iterator \c v in [values_first, values_last) it attempts to
+ * find the value *v in an ordered range [first, last).
+ * Specifically, it returns the index of the last position where the value could
+ * be inserted without violating the ordering. This version of
+ * \p upper_bound uses function object \c comp for comparison.
+ *
+ * The algorithm's execution is parallelized as determined by \p exec.
+ *
+ * \param exec The execution policy to use for parallelization.
+ * \param first The beginning of the ordered sequence.
+ * \param last The end of the ordered sequence.
+ * \param values_first The beginning of the search values sequence.
+ * \param values_last The end of the search values sequence.
+ * \param result The beginning of the output sequence.
+ * \param comp The comparison operator.
+ *
+ * \tparam DerivedPolicy The name of the derived execution policy.
+ * \tparam ForwardIterator is a model of Forward Iterator.
+ * \tparam InputIterator is a model of Input Iterator,
+ * and \c InputIterator's \c value_type is comparable to \p ForwardIterator's \c value_type.
+ * \tparam OutputIterator is a model of Output Iterator,
+ * and \c ForwardIterator's \c difference_type is convertible to \c OutputIterator's \c value_type.
+ * \tparam StrictWeakOrdering is a model of Strict Weak Ordering.
+ *
+ * \pre The ranges [first,last) and [result, result + (last - first)) shall not overlap.
+ *
+ * The following code snippet demonstrates how to use \p upper_bound
+ * to search for multiple values in an ordered range using the \p thrust::device execution policy for
+ * parallelization:
+ *
+ * \code
+ * #include <thrust/binary_search.h>
+ * #include <thrust/device_vector.h>
+ * #include <thrust/execution_policy.h>
+ * #include <thrust/functional.h>
+ * ...
+ * thrust::device_vector input(5); + * + * input[0] = 0; + * input[1] = 2; + * input[2] = 5; + * input[3] = 7; + * input[4] = 8; + * + * thrust::device_vector values(6); + * values[0] = 0; + * values[1] = 1; + * values[2] = 2; + * values[3] = 3; + * values[4] = 8; + * values[5] = 9; + * + * thrust::device_vector output(6); + * + * thrust::upper_bound(thrust::device, + * input.begin(), input.end(), + * values.begin(), values.end(), + * output.begin(), + * thrust::less()); + * + * // output is now [1, 1, 2, 2, 5, 5] + * \endcode + * + * \see http://www.sgi.com/tech/stl/upper_bound.html + * \see \p lower_bound + * \see \p equal_range + * \see \p binary_search + */ +template +OutputIterator upper_bound(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + InputIterator values_first, + InputIterator values_last, + OutputIterator result, + StrictWeakOrdering comp); + + +/*! \p upper_bound is a vectorized version of binary search: for each + * iterator \c v in [values_first, values_last) it attempts to + * find the value *v in an ordered range [first, last). + * Specifically, it returns the index of first position where value could + * be inserted without violating the ordering. This version of + * \p upper_bound uses function object \c comp for comparison. + * + * \param first The beginning of the ordered sequence. + * \param last The end of the ordered sequence. + * \param values_first The beginning of the search values sequence. + * \param values_last The end of the search values sequence. + * \param result The beginning of the output sequence. + * \param comp The comparison operator. + * + * \tparam ForwardIterator is a model of Forward Iterator. + * \tparam InputIterator is a model of Input Iterator. + * and \c InputIterator's \c value_type is comparable to \p ForwardIterator's \c value_type. + * \tparam OutputIterator is a model of Output Iterator. + * and \c ForwardIterator's difference_type is convertible to \c OutputIterator's \c value_type. + * \tparam StrictWeakOrdering is a model of Strict Weak Ordering. + * + * \pre The ranges [first,last) and [result, result + (last - first)) shall not overlap. + * + * The following code snippet demonstrates how to use \p lower_bound + * to search for multiple values in a ordered range. + * + * \code + * #include + * #include + * #include + * ... + * thrust::device_vector input(5); + * + * input[0] = 0; + * input[1] = 2; + * input[2] = 5; + * input[3] = 7; + * input[4] = 8; + * + * thrust::device_vector values(6); + * values[0] = 0; + * values[1] = 1; + * values[2] = 2; + * values[3] = 3; + * values[4] = 8; + * values[5] = 9; + * + * thrust::device_vector output(6); + * + * thrust::upper_bound(input.begin(), input.end(), + * values.begin(), values.end(), + * output.begin(), + * thrust::less()); + * + * // output is now [1, 1, 2, 2, 5, 5] + * \endcode + * + * \see http://www.sgi.com/tech/stl/upper_bound.html + * \see \p lower_bound + * \see \p equal_range + * \see \p binary_search + */ +template +OutputIterator upper_bound(ForwardIterator first, + ForwardIterator last, + InputIterator values_first, + InputIterator values_last, + OutputIterator result, + StrictWeakOrdering comp); + + +/*! \p binary_search is a vectorized version of binary search: for each + * iterator \c v in [values_first, values_last) it attempts to + * find the value *v in an ordered range [first, last). 
+ * It returns \c true if an element that is equivalent to \c value + * is present in [first, last) and \c false if no such element + * exists. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the ordered sequence. + * \param last The end of the ordered sequence. + * \param values_first The beginning of the search values sequence. + * \param values_last The end of the search values sequence. + * \param result The beginning of the output sequence. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam ForwardIterator is a model of Forward Iterator. + * \tparam InputIterator is a model of Input Iterator. + * and \c InputIterator's \c value_type is LessThanComparable. + * \tparam OutputIterator is a model of Output Iterator. + * and bool is convertible to \c OutputIterator's \c value_type. + * + * \pre The ranges [first,last) and [result, result + (last - first)) shall not overlap. + * + * The following code snippet demonstrates how to use \p binary_search + * to search for multiple values in a ordered range using the \p thrust::device execution policy for + * parallelization: + * + * \code + * #include + * #include + * #include + * ... + * thrust::device_vector input(5); + * + * input[0] = 0; + * input[1] = 2; + * input[2] = 5; + * input[3] = 7; + * input[4] = 8; + * + * thrust::device_vector values(6); + * values[0] = 0; + * values[1] = 1; + * values[2] = 2; + * values[3] = 3; + * values[4] = 8; + * values[5] = 9; + * + * thrust::device_vector output(6); + * + * thrust::binary_search(thrust::device, + * input.begin(), input.end(), + * values.begin(), values.end(), + * output.begin()); + * + * // output is now [true, false, true, false, true, false] + * \endcode + * + * \see http://www.sgi.com/tech/stl/binary_search.html + * \see \p lower_bound + * \see \p upper_bound + * \see \p equal_range + */ +template +OutputIterator binary_search(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + InputIterator values_first, + InputIterator values_last, + OutputIterator result); + + +/*! \p binary_search is a vectorized version of binary search: for each + * iterator \c v in [values_first, values_last) it attempts to + * find the value *v in an ordered range [first, last). + * It returns \c true if an element that is equivalent to \c value + * is present in [first, last) and \c false if no such element + * exists. + * + * \param first The beginning of the ordered sequence. + * \param last The end of the ordered sequence. + * \param values_first The beginning of the search values sequence. + * \param values_last The end of the search values sequence. + * \param result The beginning of the output sequence. + * + * \tparam ForwardIterator is a model of Forward Iterator. + * \tparam InputIterator is a model of Input Iterator. + * and \c InputIterator's \c value_type is LessThanComparable. + * \tparam OutputIterator is a model of Output Iterator. + * and bool is convertible to \c OutputIterator's \c value_type. + * + * \pre The ranges [first,last) and [result, result + (last - first)) shall not overlap. + * + * The following code snippet demonstrates how to use \p binary_search + * to search for multiple values in a ordered range. + * + * \code + * #include + * #include + * ... 
+ * thrust::device_vector input(5); + * + * input[0] = 0; + * input[1] = 2; + * input[2] = 5; + * input[3] = 7; + * input[4] = 8; + * + * thrust::device_vector values(6); + * values[0] = 0; + * values[1] = 1; + * values[2] = 2; + * values[3] = 3; + * values[4] = 8; + * values[5] = 9; + * + * thrust::device_vector output(6); + * + * thrust::binary_search(input.begin(), input.end(), + * values.begin(), values.end(), + * output.begin()); + * + * // output is now [true, false, true, false, true, false] + * \endcode + * + * \see http://www.sgi.com/tech/stl/binary_search.html + * \see \p lower_bound + * \see \p upper_bound + * \see \p equal_range + */ +template +OutputIterator binary_search(ForwardIterator first, + ForwardIterator last, + InputIterator values_first, + InputIterator values_last, + OutputIterator result); + + +/*! \p binary_search is a vectorized version of binary search: for each + * iterator \c v in [values_first, values_last) it attempts to + * find the value *v in an ordered range [first, last). + * It returns \c true if an element that is equivalent to \c value + * is present in [first, last) and \c false if no such element + * exists. This version of \p binary_search uses function object + * \c comp for comparison. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the ordered sequence. + * \param last The end of the ordered sequence. + * \param values_first The beginning of the search values sequence. + * \param values_last The end of the search values sequence. + * \param result The beginning of the output sequence. + * \param comp The comparison operator. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam ForwardIterator is a model of Forward Iterator. + * \tparam InputIterator is a model of Input Iterator. + * and \c InputIterator's \c value_type is LessThanComparable. + * \tparam OutputIterator is a model of Output Iterator. + * and bool is convertible to \c OutputIterator's \c value_type. + * \tparam StrictWeakOrdering is a model of Strict Weak Ordering. + * + * \pre The ranges [first,last) and [result, result + (last - first)) shall not overlap. + * + * The following code snippet demonstrates how to use \p binary_search + * to search for multiple values in a ordered range using the \p thrust::device execution policy for + * parallelization: + * + * \code + * #include + * #include + * #include + * #include + * ... + * thrust::device_vector input(5); + * + * input[0] = 0; + * input[1] = 2; + * input[2] = 5; + * input[3] = 7; + * input[4] = 8; + * + * thrust::device_vector values(6); + * values[0] = 0; + * values[1] = 1; + * values[2] = 2; + * values[3] = 3; + * values[4] = 8; + * values[5] = 9; + * + * thrust::device_vector output(6); + * + * thrust::binary_search(thrust::device, + * input.begin(), input.end(), + * values.begin(), values.end(), + * output.begin(), + * thrust::less()); + * + * // output is now [true, false, true, false, true, false] + * \endcode + * + * \see http://www.sgi.com/tech/stl/binary_search.html + * \see \p lower_bound + * \see \p upper_bound + * \see \p equal_range + */ +template +OutputIterator binary_search(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + InputIterator values_first, + InputIterator values_last, + OutputIterator result, + StrictWeakOrdering comp); + + +/*! 
\p binary_search is a vectorized version of binary search: for each + * iterator \c v in [values_first, values_last) it attempts to + * find the value *v in an ordered range [first, last). + * It returns \c true if an element that is equivalent to \c value + * is present in [first, last) and \c false if no such element + * exists. This version of \p binary_search uses function object + * \c comp for comparison. + * + * \param first The beginning of the ordered sequence. + * \param last The end of the ordered sequence. + * \param values_first The beginning of the search values sequence. + * \param values_last The end of the search values sequence. + * \param result The beginning of the output sequence. + * \param comp The comparison operator. + * + * \tparam ForwardIterator is a model of Forward Iterator. + * \tparam InputIterator is a model of Input Iterator. + * and \c InputIterator's \c value_type is LessThanComparable. + * \tparam OutputIterator is a model of Output Iterator. + * and bool is convertible to \c OutputIterator's \c value_type. + * \tparam StrictWeakOrdering is a model of Strict Weak Ordering. + * + * \pre The ranges [first,last) and [result, result + (last - first)) shall not overlap. + * + * The following code snippet demonstrates how to use \p binary_search + * to search for multiple values in a ordered range. + * + * \code + * #include + * #include + * #include + * ... + * thrust::device_vector input(5); + * + * input[0] = 0; + * input[1] = 2; + * input[2] = 5; + * input[3] = 7; + * input[4] = 8; + * + * thrust::device_vector values(6); + * values[0] = 0; + * values[1] = 1; + * values[2] = 2; + * values[3] = 3; + * values[4] = 8; + * values[5] = 9; + * + * thrust::device_vector output(6); + * + * thrust::binary_search(input.begin(), input.end(), + * values.begin(), values.end(), + * output.begin(), + * thrust::less()); + * + * // output is now [true, false, true, false, true, false] + * \endcode + * + * \see http://www.sgi.com/tech/stl/binary_search.html + * \see \p lower_bound + * \see \p upper_bound + * \see \p equal_range + */ +template +OutputIterator binary_search(ForwardIterator first, + ForwardIterator last, + InputIterator values_first, + InputIterator values_last, + OutputIterator result, + StrictWeakOrdering comp); + + +/*! \} // end vectorized_binary_search + */ + + +/*! \} // end binary_search + */ + + +/*! \} // end searching + */ + + +} // end namespace thrust + +#include + diff --git a/compat/thrust/copy.h b/compat/thrust/copy.h new file mode 100644 index 0000000..eaa9719 --- /dev/null +++ b/compat/thrust/copy.h @@ -0,0 +1,505 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file copy.h + * \brief Copies elements from one range to another + */ + +#pragma once + +#include +#include + +namespace thrust +{ + +/*! \addtogroup algorithms + */ + +/*! \addtogroup copying + * \ingroup algorithms + * \{ + */ + + +/*! 
\p copy copies elements from the range [\p first, \p last) to the range + * [\p result, \p result + (\p last - \p first)). That is, it performs + * the assignments *\p result = *\p first, *(\p result + \c 1) = *(\p first + \c 1), + * and so on. Generally, for every integer \c n from \c 0 to \p last - \p first, \p copy + * performs the assignment *(\p result + \c n) = *(\p first + \c n). Unlike + * \c std::copy, \p copy offers no guarantee on order of operation. As a result, + * calling \p copy with overlapping source and destination ranges has undefined + * behavior. + * + * The return value is \p result + (\p last - \p first). + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the sequence to copy. + * \param last The end of the sequence to copy. + * \param result The destination sequence. + * \return The end of the destination sequence. + * \see http://www.sgi.com/tech/stl/copy.html + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator must be a model of Input Iterator and \c InputIterator's \c value_type must be convertible to \c OutputIterator's \c value_type. + * \tparam OutputIterator must be a model of Output Iterator. + * + * \pre \p result may be equal to \p first, but \p result shall not be in the range [first, last) otherwise. + * + * The following code snippet demonstrates how to use \p copy + * to copy from one range to another using the \p thrust::device parallelization policy: + * + * \code + * #include + * #include + * #include + * ... + * + * thrust::device_vector vec0(100); + * thrust::device_vector vec1(100); + * ... + * + * thrust::copy(thrust::device, vec0.begin(), vec0.end(), vec1.begin()); + * + * // vec1 is now a copy of vec0 + * \endcode + */ +template + OutputIterator copy(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + OutputIterator result); + + +/*! \p copy_n copies elements from the range [first, first + n) to the range + * [result, result + n). That is, it performs the assignments *result = *first, *(result + 1) = *(first + 1), + * and so on. Generally, for every integer \c i from \c 0 to \c n, \p copy + * performs the assignment *(\p result + \c i) = *(\p first + \c i). Unlike + * \c std::copy_n, \p copy_n offers no guarantee on order of operation. As a result, + * calling \p copy_n with overlapping source and destination ranges has undefined + * behavior. + * + * The return value is \p result + \p n. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the range to copy. + * \param n The number of elements to copy. + * \param result The beginning destination range. + * \return The end of the destination range. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator must be a model of Input Iterator and \c InputIterator's \c value_type must be convertible to \c OutputIterator's \c value_type. + * \tparam Size is an integral type. + * \tparam OutputIterator must be a model of Output Iterator. + * + * \pre \p result may be equal to \p first, but \p result shall not be in the range [first, first + n) otherwise. 
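+ *
+ * Besides plain memory-to-memory copies, any input iterator can serve as the source.
+ * As a small sketch (the \c counting_iterator source and the vector name \c vec are
+ * illustrative assumptions, not part of this interface), \p copy_n can fill a vector
+ * with the sequence 0, 1, 2, ...:
+ *
+ * \code
+ * #include <thrust/copy.h>
+ * #include <thrust/device_vector.h>
+ * #include <thrust/execution_policy.h>
+ * #include <thrust/iterator/counting_iterator.h>
+ * ...
+ * thrust::device_vector<int> vec(10);
+ * thrust::copy_n(thrust::device, thrust::counting_iterator<int>(0), 10, vec.begin());
+ * // vec is now [0, 1, 2, ..., 9]
+ * \endcode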
+ * + * The following code snippet demonstrates how to use \p copy + * to copy from one range to another using the \p thrust::device parallelization policy: + * + * \code + * #include + * #include + * #include + * ... + * size_t n = 100; + * thrust::device_vector vec0(n); + * thrust::device_vector vec1(n); + * ... + * thrust::copy_n(thrust::device, vec0.begin(), n, vec1.begin()); + * + * // vec1 is now a copy of vec0 + * \endcode + * + * \see http://www.sgi.com/tech/stl/copy_n.html + * \see thrust::copy + */ +template + OutputIterator copy_n(const thrust::detail::execution_policy_base &exec, + InputIterator first, + Size n, + OutputIterator result); + + + +/*! \p copy copies elements from the range [\p first, \p last) to the range + * [\p result, \p result + (\p last - \p first)). That is, it performs + * the assignments *\p result = *\p first, *(\p result + \c 1) = *(\p first + \c 1), + * and so on. Generally, for every integer \c n from \c 0 to \p last - \p first, \p copy + * performs the assignment *(\p result + \c n) = *(\p first + \c n). Unlike + * \c std::copy, \p copy offers no guarantee on order of operation. As a result, + * calling \p copy with overlapping source and destination ranges has undefined + * behavior. + * + * The return value is \p result + (\p last - \p first). + * + * \param first The beginning of the sequence to copy. + * \param last The end of the sequence to copy. + * \param result The destination sequence. + * \return The end of the destination sequence. + * \see http://www.sgi.com/tech/stl/copy.html + * + * \tparam InputIterator must be a model of Input Iterator and \c InputIterator's \c value_type must be convertible to \c OutputIterator's \c value_type. + * \tparam OutputIterator must be a model of Output Iterator. + * + * \pre \p result may be equal to \p first, but \p result shall not be in the range [first, last) otherwise. + * + * The following code snippet demonstrates how to use \p copy + * to copy from one range to another. + * + * \code + * #include + * #include + * ... + * + * thrust::device_vector vec0(100); + * thrust::device_vector vec1(100); + * ... + * + * thrust::copy(vec0.begin(), vec0.end(), + * vec1.begin()); + * + * // vec1 is now a copy of vec0 + * \endcode + */ +template + OutputIterator copy(InputIterator first, + InputIterator last, + OutputIterator result); + +/*! \p copy_n copies elements from the range [first, first + n) to the range + * [result, result + n). That is, it performs the assignments *result = *first, *(result + 1) = *(first + 1), + * and so on. Generally, for every integer \c i from \c 0 to \c n, \p copy + * performs the assignment *(\p result + \c i) = *(\p first + \c i). Unlike + * \c std::copy_n, \p copy_n offers no guarantee on order of operation. As a result, + * calling \p copy_n with overlapping source and destination ranges has undefined + * behavior. + * + * The return value is \p result + \p n. + * + * \param first The beginning of the range to copy. + * \param n The number of elements to copy. + * \param result The beginning destination range. + * \return The end of the destination range. + * + * \tparam InputIterator must be a model of Input Iterator and \c InputIterator's \c value_type must be convertible to \c OutputIterator's \c value_type. + * \tparam Size is an integral type. + * \tparam OutputIterator must be a model of Output Iterator. + * + * \pre \p result may be equal to \p first, but \p result shall not be in the range [first, first + n) otherwise. 
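+ *
+ * The source and destination ranges may also live in different memory spaces.
+ * As a brief sketch (the host \c std::vector and the names used here are
+ * illustrative assumptions), \p copy_n can move data from the host to the device:
+ *
+ * \code
+ * #include <thrust/copy.h>
+ * #include <thrust/device_vector.h>
+ * #include <vector>
+ * ...
+ * std::vector<int> h_vec(100, 1);
+ * thrust::device_vector<int> d_vec(100);
+ * thrust::copy_n(h_vec.begin(), 100, d_vec.begin());
+ * // d_vec now holds 100 copies of the value 1
+ * \endcode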
+ *
+ * The following code snippet demonstrates how to use \p copy_n
+ * to copy from one range to another.
+ *
+ * \code
+ * #include <thrust/copy.h>
+ * #include <thrust/device_vector.h>
+ * ...
+ * size_t n = 100;
+ * thrust::device_vector<int> vec0(n);
+ * thrust::device_vector<int> vec1(n);
+ * ...
+ * thrust::copy_n(vec0.begin(), n, vec1.begin());
+ *
+ * // vec1 is now a copy of vec0
+ * \endcode
+ *
+ * \see http://www.sgi.com/tech/stl/copy_n.html
+ * \see thrust::copy
+ */
+template<typename InputIterator, typename Size, typename OutputIterator>
+  OutputIterator copy_n(InputIterator first,
+                        Size n,
+                        OutputIterator result);
+
+/*! \} // end copying
+ */
+
+/*! \addtogroup stream_compaction
+ * \{
+ */
+
+
+/*! This version of \p copy_if copies elements from the range [first,last)
+ * to a range beginning at \p result, except that any element which causes \p pred
+ * to be \c false is not copied.
+ *
+ * More precisely, for every integer \c n such that 0 <= n < last-first,
+ * \p copy_if performs the assignment *result = *(first+n) and \p result
+ * is advanced one position if pred(*(first+n)). Otherwise, no assignment
+ * occurs and \p result is not advanced.
+ *
+ * The algorithm's execution is parallelized as determined by \p exec.
+ *
+ * \param exec The execution policy to use for parallelization.
+ * \param first The beginning of the sequence from which to copy.
+ * \param last The end of the sequence from which to copy.
+ * \param result The beginning of the sequence into which to copy.
+ * \param pred The predicate to test on every value of the range [first, last).
+ * \return result + n, where \c n is equal to the number of times \p pred
+ * evaluated to \c true in the range [first, last).
+ *
+ * \tparam DerivedPolicy The name of the derived execution policy.
+ * \tparam InputIterator is a model of Input Iterator,
+ * and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
+ * \tparam OutputIterator is a model of Output Iterator.
+ * \tparam Predicate is a model of Predicate.
+ *
+ * \pre The ranges [first, last) and [result, result + (last - first)) shall not overlap.
+ *
+ * The following code snippet demonstrates how to use \p copy_if to perform stream compaction
+ * to copy even numbers to an output range using the \p thrust::host parallelization policy:
+ *
+ * \code
+ * #include <thrust/copy.h>
+ * #include <thrust/execution_policy.h>
+ * ...
+ * struct is_even
+ * {
+ *   __host__ __device__
+ *   bool operator()(const int x)
+ *   {
+ *     return (x % 2) == 0;
+ *   }
+ * };
+ * ...
+ * const int N = 6;
+ * int V[N] = {-2, 0, -1, 0, 1, 2};
+ * int result[4];
+ *
+ * thrust::copy_if(thrust::host, V, V + N, result, is_even());
+ *
+ * // V remains {-2, 0, -1, 0, 1, 2}
+ * // result is now {-2, 0, 0, 2}
+ * \endcode
+ *
+ * \see \c remove_copy_if
+ */
+template<typename DerivedPolicy, typename InputIterator, typename OutputIterator, typename Predicate>
+  OutputIterator copy_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                         InputIterator first,
+                         InputIterator last,
+                         OutputIterator result,
+                         Predicate pred);
+
+
+
+/*! This version of \p copy_if copies elements from the range [first,last)
+ * to a range beginning at \p result, except that any element which causes \p pred
+ * to be \c false is not copied.
+ *
+ * More precisely, for every integer \c n such that 0 <= n < last-first,
+ * \p copy_if performs the assignment *result = *(first+n) and \p result
+ * is advanced one position if pred(*(first+n)). Otherwise, no assignment
+ * occurs and \p result is not advanced.
+ *
+ * \param first The beginning of the sequence from which to copy.
+ * \param last The end of the sequence from which to copy.
+ * \param result The beginning of the sequence into which to copy. + * \param pred The predicate to test on every value of the range [first, last). + * \return result + n, where \c n is equal to the number of times \p pred + * evaluated to \c true in the range [first, last). + * + * \tparam InputIterator is a model of Input Iterator, + * and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type. + * \tparam OutputIterator is a model of Output Iterator. + * \tparam Predicate is a model of Predicate. + * + * \pre The ranges [first, last) and [result, result + (last - first)) shall not overlap. + * + * The following code snippet demonstrates how to use \p copy_if to perform stream compaction + * to copy even numbers to an output range. + * + * \code + * #include + * ... + * struct is_even + * { + * __host__ __device__ + * bool operator()(const int x) + * { + * return (x % 2) == 0; + * } + * }; + * ... + * const int N = 6; + * int V[N] = {-2, 0, -1, 0, 1, 2}; + * int result[4]; + * + * thrust::copy_if(V, V + N, result, is_even()); + * + * // V remains {-2, 0, -1, 0, 1, 2} + * // result is now {-2, 0, 0, 2} + * \endcode + * + * \see \c remove_copy_if + */ +template + OutputIterator copy_if(InputIterator first, + InputIterator last, + OutputIterator result, + Predicate pred); + + +/*! This version of \p copy_if copies elements from the range [first,last) + * to a range beginning at \p result, except that any element whose corresponding stencil + * element causes \p pred to be \c false is not copied. + * + * More precisely, for every integer \c n such that 0 <= n < last-first, + * \p copy_if performs the assignment *result = *(first+n) and \p result + * is advanced one position if pred(*(stencil+n)). Otherwise, no assignment + * occurs and \p result is not advanced. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the sequence from which to copy. + * \param last The end of the sequence from which to copy. + * \param stencil The beginning of the stencil sequence. + * \param result The beginning of the sequence into which to copy. + * \param pred The predicate to test on every value of the range [stencil, stencil + (last-first)). + * \return result + n, where \c n is equal to the number of times \p pred + * evaluated to \c true in the range [stencil, stencil + (last-first)). + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator1 is a model of Input Iterator. + * \tparam InputIterator2 is a model of Input Iterator, + * and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type. + * \tparam OutputIterator is a model of Output Iterator. + * \tparam Predicate is a model of Predicate. + * + * \pre The ranges [first, last) and [result, result + (last - first)) shall not overlap. + * \pre The ranges [stencil, stencil + (last - first)) and [result, result + (last - first)) shall not overlap. + * + * The following code snippet demonstrates how to use \p copy_if to perform stream compaction + * to copy numbers to an output range when corresponding stencil elements are even using the \p thrust::host execution policy: + * + * \code + * #include + * #include + * ... + * struct is_even + * { + * __host__ __device__ + * bool operator()(const int x) + * { + * return (x % 2) == 0; + * } + * }; + * ... 
+ * int N = 6; + * int data[N] = { 0, 1, 2, 3, 4, 5}; + * int stencil[N] = {-2, 0, -1, 0, 1, 2}; + * int result[4]; + * + * thrust::copy_if(thrust::host, data, data + N, stencil, result, is_even()); + * + * // data remains = { 0, 1, 2, 3, 4, 5}; + * // stencil remains = {-2, 0, -1, 0, 1, 2}; + * // result is now { 0, 1, 3, 5} + * \endcode + * + * \see \c remove_copy_if + */ +template + OutputIterator copy_if(const thrust::detail::execution_policy_base &exec, + InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator result, + Predicate pred); + + +/*! This version of \p copy_if copies elements from the range [first,last) + * to a range beginning at \p result, except that any element whose corresponding stencil + * element causes \p pred to be \c false is not copied. + * + * More precisely, for every integer \c n such that 0 <= n < last-first, + * \p copy_if performs the assignment *result = *(first+n) and \p result + * is advanced one position if pred(*(stencil+n)). Otherwise, no assignment + * occurs and \p result is not advanced. + * + * \param first The beginning of the sequence from which to copy. + * \param last The end of the sequence from which to copy. + * \param stencil The beginning of the stencil sequence. + * \param result The beginning of the sequence into which to copy. + * \param pred The predicate to test on every value of the range [stencil, stencil + (last-first)). + * \return result + n, where \c n is equal to the number of times \p pred + * evaluated to \c true in the range [stencil, stencil + (last-first)). + * + * \tparam InputIterator1 is a model of Input Iterator. + * \tparam InputIterator2 is a model of Input Iterator, + * and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type. + * \tparam OutputIterator is a model of Output Iterator. + * \tparam Predicate is a model of Predicate. + * + * \pre The ranges [first, last) and [result, result + (last - first)) shall not overlap. + * \pre The ranges [stencil, stencil + (last - first)) and [result, result + (last - first)) shall not overlap. + * + * The following code snippet demonstrates how to use \p copy_if to perform stream compaction + * to copy numbers to an output range when corresponding stencil elements are even: + * + * \code + * #include + * ... + * struct is_even + * { + * __host__ __device__ + * bool operator()(const int x) + * { + * return (x % 2) == 0; + * } + * }; + * ... + * int N = 6; + * int data[N] = { 0, 1, 2, 3, 4, 5}; + * int stencil[N] = {-2, 0, -1, 0, 1, 2}; + * int result[4]; + * + * thrust::copy_if(data, data + N, stencil, result, is_even()); + * + * // data remains = { 0, 1, 2, 3, 4, 5}; + * // stencil remains = {-2, 0, -1, 0, 1, 2}; + * // result is now { 0, 1, 3, 5} + * \endcode + * + * \see \c remove_copy_if + */ +template + OutputIterator copy_if(InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator result, + Predicate pred); + +/*! \} // end stream_compaction + */ + +} // end namespace thrust + +#include +#include + diff --git a/compat/thrust/count.h b/compat/thrust/count.h new file mode 100644 index 0000000..cddd1dd --- /dev/null +++ b/compat/thrust/count.h @@ -0,0 +1,231 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file count.h + * \brief Counting elements in a range + */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ + + +/*! \addtogroup algorithms + */ + +/*! \addtogroup reductions + * \ingroup algorithms + * \{ + */ + +/*! \addtogroup counting + * \ingroup reductions + * \{ + */ + + +/*! \p count finds the number of elements in [first,last) that are equal + * to \p value. More precisely, \p count returns the number of iterators \c i in + * [first, last) such that *i == value. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the sequence. + * \param last The end of the sequence. + * \param value The value to be counted. + * \return The number of elements equal to \p value. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator must be a model of Input Iterator and \c InputIterator's \c value_type must be a model of must be a model of Equality Comparable. + * \tparam EqualityComparable must be a model of Equality Comparable and can be compared for equality with \c InputIterator's \c value_type + * + * The following code snippet demonstrates how to use \p count to + * count the number of instances in a range of a value of interest using the \p thrust::device execution policy: + * + * \code + * #include + * #include + * #include + * ... + * // put 3 1s in a device_vector + * thrust::device_vector vec(5,0); + * vec[1] = 1; + * vec[3] = 1; + * vec[4] = 1; + * + * // count the 1s + * int result = thrust::count(thrust::device, vec.begin(), vec.end(), 1); + * // result == 3 + * \endcode + * + * \see http://www.sgi.com/tech/stl/count.html + */ +template + typename thrust::iterator_traits::difference_type + count(const thrust::detail::execution_policy_base &exec, InputIterator first, InputIterator last, const EqualityComparable& value); + + + +/*! \p count finds the number of elements in [first,last) that are equal + * to \p value. More precisely, \p count returns the number of iterators \c i in + * [first, last) such that *i == value. + * + * \param first The beginning of the sequence. + * \param last The end of the sequence. + * \param value The value to be counted. + * \return The number of elements equal to \p value. + * + * \tparam InputIterator must be a model of Input Iterator and \c InputIterator's \c value_type must be a model of must be a model of Equality Comparable. + * \tparam EqualityComparable must be a model of Equality Comparable and can be compared for equality with \c InputIterator's \c value_type + * + * The following code snippet demonstrates how to use \p count to + * count the number of instances in a range of a value of interest. + * \code + * #include + * #include + * ... 
+ * // put 3 1s in a device_vector + * thrust::device_vector vec(5,0); + * vec[1] = 1; + * vec[3] = 1; + * vec[4] = 1; + * + * // count the 1s + * int result = thrust::count(vec.begin(), vec.end(), 1); + * // result == 3 + * \endcode + * + * \see http://www.sgi.com/tech/stl/count.html + */ +template + typename thrust::iterator_traits::difference_type + count(InputIterator first, InputIterator last, const EqualityComparable& value); + + +/*! \p count_if finds the number of elements in [first,last) for which + * a predicate is \c true. More precisely, \p count_if returns the number of iterators + * \c i in [first, last) such that pred(*i) == true. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the sequence. + * \param last The end of the sequence. + * \param pred The predicate. + * \return The number of elements where \p pred is \c true. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator must be a model of Input Iterator and \c InputIterator's \c value_type must be convertible to \c Predicate's \c argument_type. + * \tparam Predicate must be a model of Predicate. + * + * The following code snippet demonstrates how to use \p count to + * count the number of odd numbers in a range using the \p thrust::device execution policy: + * + * \code + * #include + * #include + * #include + * ... + * struct is_odd + * { + * __host__ __device__ + * bool operator()(int &x) + * { + * return x & 1; + * } + * }; + * ... + * // fill a device_vector with even & odd numbers + * thrust::device_vector vec(5); + * vec[0] = 0; + * vec[1] = 1; + * vec[2] = 2; + * vec[3] = 3; + * vec[4] = 4; + * + * // count the odd elements in vec + * int result = thrust::count_if(thrust::device, vec.begin(), vec.end(), is_odd()); + * // result == 2 + * \endcode + * + * \see http://www.sgi.com/tech/stl/count.html + */ +template + typename thrust::iterator_traits::difference_type + count_if(const thrust::detail::execution_policy_base &exec, InputIterator first, InputIterator last, Predicate pred); + + +/*! \p count_if finds the number of elements in [first,last) for which + * a predicate is \c true. More precisely, \p count_if returns the number of iterators + * \c i in [first, last) such that pred(*i) == true. + * + * \param first The beginning of the sequence. + * \param last The end of the sequence. + * \param pred The predicate. + * \return The number of elements where \p pred is \c true. + * + * \tparam InputIterator must be a model of Input Iterator and \c InputIterator's \c value_type must be convertible to \c Predicate's \c argument_type. + * \tparam Predicate must be a model of Predicate. + * + * The following code snippet demonstrates how to use \p count to + * count the number of odd numbers in a range. + * \code + * #include + * #include + * ... + * struct is_odd + * { + * __host__ __device__ + * bool operator()(int &x) + * { + * return x & 1; + * } + * }; + * ... 
+ * // fill a device_vector with even & odd numbers + * thrust::device_vector vec(5); + * vec[0] = 0; + * vec[1] = 1; + * vec[2] = 2; + * vec[3] = 3; + * vec[4] = 4; + * + * // count the odd elements in vec + * int result = thrust::count_if(vec.begin(), vec.end(), is_odd()); + * // result == 2 + * \endcode + * + * \see http://www.sgi.com/tech/stl/count.html + */ +template + typename thrust::iterator_traits::difference_type + count_if(InputIterator first, InputIterator last, Predicate pred); + +/*! \} // end counting + * \} // end reductions + */ + +} // end thrust + +#include + diff --git a/compat/thrust/detail/adjacent_difference.inl b/compat/thrust/detail/adjacent_difference.inl new file mode 100644 index 0000000..6590f9d --- /dev/null +++ b/compat/thrust/detail/adjacent_difference.inl @@ -0,0 +1,88 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file adjacent_difference.inl + * \brief Inline file for adjacent_difference.h + */ + +#include +#include +#include +#include + +namespace thrust +{ + + +template +OutputIterator adjacent_difference(const thrust::detail::execution_policy_base &exec, + InputIterator first, InputIterator last, + OutputIterator result) +{ + using thrust::system::detail::generic::adjacent_difference; + + return adjacent_difference(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result); +} // end adjacent_difference() + + +template +OutputIterator adjacent_difference(const thrust::detail::execution_policy_base &exec, + InputIterator first, InputIterator last, + OutputIterator result, + BinaryFunction binary_op) +{ + using thrust::system::detail::generic::adjacent_difference; + + return adjacent_difference(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, binary_op); +} // end adjacent_difference() + + +template +OutputIterator adjacent_difference(InputIterator first, InputIterator last, + OutputIterator result) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + + System1 system1; + System2 system2; + + return thrust::adjacent_difference(select_system(system1, system2), first, last, result); +} // end adjacent_difference() + + +template +OutputIterator adjacent_difference(InputIterator first, InputIterator last, + OutputIterator result, + BinaryFunction binary_op) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + + System1 system1; + System2 system2; + + return thrust::adjacent_difference(select_system(system1, system2), first, last, result, binary_op); +} // end adjacent_difference() + + +} // end namespace thrust + diff --git a/compat/thrust/detail/advance.inl b/compat/thrust/detail/advance.inl new file mode 100644 index 0000000..2907be7 --- /dev/null +++ 
b/compat/thrust/detail/advance.inl @@ -0,0 +1,38 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file advance.inl + * \brief Inline file for advance.h + */ + +#include +#include +#include + +namespace thrust +{ + + +template +void advance(InputIterator& i, Distance n) +{ + thrust::system::detail::generic::advance(i, n); +} // end advance() + + +} // end namespace thrust + diff --git a/compat/thrust/detail/allocator/allocator_traits.h b/compat/thrust/detail/allocator/allocator_traits.h new file mode 100644 index 0000000..6ee99b4 --- /dev/null +++ b/compat/thrust/detail/allocator/allocator_traits.h @@ -0,0 +1,240 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ +namespace detail +{ +namespace allocator_traits_detail +{ + +__THRUST_DEFINE_HAS_NESTED_TYPE(has_pointer, pointer) +__THRUST_DEFINE_HAS_NESTED_TYPE(has_const_pointer, const_pointer) +__THRUST_DEFINE_HAS_NESTED_TYPE(has_reference, reference) +__THRUST_DEFINE_HAS_NESTED_TYPE(has_const_reference, const_reference) +__THRUST_DEFINE_HAS_NESTED_TYPE(has_void_pointer, void_pointer) +__THRUST_DEFINE_HAS_NESTED_TYPE(has_const_void_pointer, const_void_pointer) +__THRUST_DEFINE_HAS_NESTED_TYPE(has_difference_type, difference_type) +__THRUST_DEFINE_HAS_NESTED_TYPE(has_size_type, size_type) +__THRUST_DEFINE_HAS_NESTED_TYPE(has_propagate_on_container_copy_assignment, propagate_on_container_copy_assignment) +__THRUST_DEFINE_HAS_NESTED_TYPE(has_propagate_on_container_move_assignment, propagate_on_container_move_assignment) +__THRUST_DEFINE_HAS_NESTED_TYPE(has_propagate_on_container_swap, propagate_on_container_swap) +__THRUST_DEFINE_HAS_NESTED_TYPE(has_system_type, system_type) + +template + struct nested_pointer +{ + typedef typename T::pointer type; +}; + +template + struct nested_const_pointer +{ + typedef typename T::const_pointer type; +}; + +template + struct nested_reference +{ + typedef typename T::reference type; +}; + +template + struct nested_const_reference +{ + typedef typename T::const_reference type; +}; + +template + struct nested_void_pointer +{ + typedef typename T::void_pointer type; +}; + +template + struct nested_const_void_pointer +{ + typedef typename T::const_void_pointer type; +}; + +template + struct nested_difference_type +{ + typedef typename T::difference_type type; +}; + +template + struct nested_size_type +{ + typedef typename T::size_type type; +}; + +template + struct nested_propagate_on_container_copy_assignment +{ + typedef typename T::propagate_on_container_copy_assignment type; +}; + +template + struct nested_propagate_on_container_move_assignment +{ + typedef typename T::propagate_on_container_move_assignment type; +}; + +template + struct nested_propagate_on_container_swap +{ + typedef typename T::propagate_on_container_swap type; +}; + +template + struct nested_system_type +{ + typedef typename T::system_type type; +}; + +} // end allocator_traits_detail + + +template + struct allocator_traits +{ + typedef Alloc allocator_type; + + typedef typename allocator_type::value_type value_type; + + typedef typename eval_if< + allocator_traits_detail::has_pointer::value, + allocator_traits_detail::nested_pointer, + identity_ + >::type pointer; + + private: + template + struct rebind_pointer + { + typedef typename pointer_traits::template rebind::other type; + }; + + public: + + typedef typename eval_if< + allocator_traits_detail::has_const_pointer::value, + allocator_traits_detail::nested_const_pointer, + rebind_pointer + >::type const_pointer; + + typedef typename eval_if< + allocator_traits_detail::has_void_pointer::value, + allocator_traits_detail::nested_void_pointer, + rebind_pointer + >::type void_pointer; + + typedef typename eval_if< + allocator_traits_detail::has_const_void_pointer::value, + allocator_traits_detail::nested_const_void_pointer, + rebind_pointer + >::type const_void_pointer; + + typedef typename eval_if< + allocator_traits_detail::has_difference_type::value, + allocator_traits_detail::nested_difference_type, + pointer_difference + >::type difference_type; + + typedef typename eval_if< + allocator_traits_detail::has_size_type::value, + 
allocator_traits_detail::nested_size_type, + make_unsigned + >::type size_type; + + typedef typename eval_if< + allocator_traits_detail::has_propagate_on_container_copy_assignment::value, + allocator_traits_detail::nested_propagate_on_container_copy_assignment, + identity_ + >::type propagate_on_container_copy_assignment; + + typedef typename eval_if< + allocator_traits_detail::has_propagate_on_container_move_assignment::value, + allocator_traits_detail::nested_propagate_on_container_move_assignment, + identity_ + >::type propagate_on_container_move_assignment; + + typedef typename eval_if< + allocator_traits_detail::has_propagate_on_container_swap::value, + allocator_traits_detail::nested_propagate_on_container_swap, + identity_ + >::type propagate_on_container_swap; + + typedef typename eval_if< + allocator_traits_detail::has_system_type::value, + allocator_traits_detail::nested_system_type, + thrust::iterator_system + >::type system_type; + + // XXX rebind and rebind_traits are alias templates + // and so are omitted while c++11 is unavailable + + inline static pointer allocate(allocator_type &a, size_type n); + + inline static pointer allocate(allocator_type &a, size_type n, const_void_pointer hint); + + inline static void deallocate(allocator_type &a, pointer p, size_type n); + + // XXX should probably change T* to pointer below and then relax later + + template + inline __host__ __device__ static void construct(allocator_type &a, T *p); + + template + inline __host__ __device__ static void construct(allocator_type &a, T *p, const Arg1 &arg1); + + template + inline __host__ __device__ static void destroy(allocator_type &a, T *p); + + inline static size_type max_size(const allocator_type &a); +}; // end allocator_traits + + +// XXX consider moving this non-standard functionality inside allocator_traits +template + struct allocator_system +{ + // the type of the allocator's system + typedef typename eval_if< + allocator_traits_detail::has_system_type::value, + allocator_traits_detail::nested_system_type, + thrust::iterator_system< + typename allocator_traits::pointer + > + >::type type; + + inline static type &get(Alloc &a); +}; + + +} // end detail +} // end thrust + +#include + diff --git a/compat/thrust/detail/allocator/allocator_traits.inl b/compat/thrust/detail/allocator/allocator_traits.inl new file mode 100644 index 0000000..8319335 --- /dev/null +++ b/compat/thrust/detail/allocator/allocator_traits.inl @@ -0,0 +1,287 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace detail +{ +namespace allocator_traits_detail +{ + +__THRUST_DEFINE_IS_CALL_POSSIBLE(has_member_allocate_with_hint_impl, allocate) + +template + class has_member_allocate_with_hint +{ + typedef typename allocator_traits::pointer pointer; + typedef typename allocator_traits::size_type size_type; + typedef typename allocator_traits::const_void_pointer const_void_pointer; + + public: + typedef typename has_member_allocate_with_hint_impl::type type; + static const bool value = type::value; +}; + +template + typename enable_if< + has_member_allocate_with_hint::value, + typename allocator_traits::pointer + >::type + allocate(Alloc &a, typename allocator_traits::size_type n, typename allocator_traits::const_void_pointer hint) +{ + return a.allocate(n,hint); +} + +template + typename disable_if< + has_member_allocate_with_hint::value, + typename allocator_traits::pointer + >::type + allocate(Alloc &a, typename allocator_traits::size_type n, typename allocator_traits::const_void_pointer) +{ + return a.allocate(n); +} + + +__THRUST_DEFINE_IS_CALL_POSSIBLE(has_member_construct1_impl, construct) + +template + struct has_member_construct1 + : has_member_construct1_impl +{}; + +template + inline __host__ __device__ + typename enable_if< + has_member_construct1::value + >::type + construct(Alloc &a, T *p) +{ + a.construct(p); +} + +template + inline __host__ __device__ + typename disable_if< + has_member_construct1::value + >::type + construct(Alloc &a, T *p) +{ + ::new(static_cast(p)) T(); +} + + +__THRUST_DEFINE_IS_CALL_POSSIBLE(has_member_construct2_impl, construct) + +template + struct has_member_construct2 + : has_member_construct2_impl +{}; + +template + inline __host__ __device__ + typename enable_if< + has_member_construct2::value + >::type + construct(Alloc &a, T *p, const Arg1 &arg1) +{ + a.construct(p,arg1); +} + +template + inline __host__ __device__ + typename disable_if< + has_member_construct2::value + >::type + construct(Alloc &, T *p, const Arg1 &arg1) +{ + ::new(static_cast(p)) T(arg1); +} + + +__THRUST_DEFINE_IS_CALL_POSSIBLE(has_member_destroy_impl, destroy) + +template + struct has_member_destroy + : has_member_destroy_impl +{}; + +template + inline __host__ __device__ + typename enable_if< + has_member_destroy::value + >::type + destroy(Alloc &a, T *p) +{ + a.destroy(p); +} + +template + inline __host__ __device__ + typename disable_if< + has_member_destroy::value + >::type + destroy(Alloc &, T *p) +{ + p->~T(); +} + + +__THRUST_DEFINE_IS_CALL_POSSIBLE(has_member_max_size_impl, max_size) + +template + class has_member_max_size +{ + typedef typename allocator_traits::size_type size_type; + + public: + typedef typename has_member_max_size_impl::type type; + static const bool value = type::value; +}; + +template + typename enable_if< + has_member_max_size::value, + typename allocator_traits::size_type + >::type + max_size(const Alloc &a) +{ + return a.max_size(); +} + +template + typename disable_if< + has_member_max_size::value, + typename allocator_traits::size_type + >::type + max_size(const Alloc &a) +{ + typedef typename allocator_traits::size_type size_type; + return std::numeric_limits::max(); +} + +__THRUST_DEFINE_HAS_MEMBER_FUNCTION(has_member_system_impl, system) + +template + class has_member_system +{ + typedef typename allocator_system::type system_type; + + public: + typedef typename has_member_system_impl::type type; + static const bool value = type::value; +}; + +template 
+ typename enable_if< + has_member_system::value, + typename allocator_system::type & + >::type + system(Alloc &a) +{ + return a.system(); +} + +template + typename disable_if< + has_member_system::value, + typename allocator_system::type & + >::type + system(Alloc &a) +{ + // assumes the system is default-constructible + static typename allocator_system::type state; + return state; +} + + +} // end allocator_traits_detail + + +template + typename allocator_traits::pointer + allocator_traits + ::allocate(Alloc &a, typename allocator_traits::size_type n) +{ + return a.allocate(n); +} + +template + typename allocator_traits::pointer + allocator_traits + ::allocate(Alloc &a, typename allocator_traits::size_type n, typename allocator_traits::const_void_pointer hint) +{ + return allocator_traits_detail::allocate(a, n, hint); +} + +template + void allocator_traits + ::deallocate(Alloc &a, typename allocator_traits::pointer p, typename allocator_traits::size_type n) +{ + return a.deallocate(p,n); +} + +template + template + void allocator_traits + ::construct(allocator_type &a, T *p) +{ + return allocator_traits_detail::construct(a,p); +} + +template + template + void allocator_traits + ::construct(allocator_type &a, T *p, const Arg1 &arg1) +{ + return allocator_traits_detail::construct(a,p,arg1); +} + +template + template + void allocator_traits + ::destroy(allocator_type &a, T *p) +{ + return allocator_traits_detail::destroy(a,p); +} + +template + typename allocator_traits::size_type + allocator_traits + ::max_size(const allocator_type &a) +{ + return allocator_traits_detail::max_size(a); +} + +template + typename allocator_system::type & + allocator_system + ::get(Alloc &a) +{ + return allocator_traits_detail::system(a); +} + + +} // end detail +} // end thrust + diff --git a/compat/thrust/detail/allocator/copy_construct_range.h b/compat/thrust/detail/allocator/copy_construct_range.h new file mode 100644 index 0000000..5d99e1f --- /dev/null +++ b/compat/thrust/detail/allocator/copy_construct_range.h @@ -0,0 +1,45 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
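allocator_traits.inl plays the same game with member functions: __THRUST_DEFINE_IS_CALL_POSSIBLE builds an "is a.construct(p) / a.destroy(p) / a.max_size() callable" predicate, and enable_if/disable_if pick between calling the allocator and the plain fallback (placement new, an explicit destructor call, numeric_limits::max). A hedged sketch of that dispatch for destroy, written with the C++11 detection idiom instead of Thrust's C++03 macros:

#include <type_traits>
#include <utility>

// does A have a callable member destroy(T*)?
template<typename A, typename T, typename = void>
struct has_member_destroy_sketch : std::false_type {};

template<typename A, typename T>
struct has_member_destroy_sketch<A, T,
  decltype(std::declval<A&>().destroy(std::declval<T*>()), void())>
  : std::true_type {};

// case 1: the allocator customizes destroy, so defer to it
template<typename A, typename T>
typename std::enable_if<has_member_destroy_sketch<A, T>::value>::type
destroy_sketch(A &a, T *p) { a.destroy(p); }

// case 2: no member destroy -- just run the destructor directly
template<typename A, typename T>
typename std::enable_if<!has_member_destroy_sketch<A, T>::value>::type
destroy_sketch(A &, T *p) { p->~T(); }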
+ */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace detail +{ + +template + Pointer copy_construct_range(thrust::execution_policy &from_system, + Allocator &a, + InputIterator first, + InputIterator last, + Pointer result); + +template + Pointer copy_construct_range_n(thrust::execution_policy &from_system, + Allocator &a, + InputIterator first, + Size n, + Pointer result); + +} // end detail +} // end thrust + +#include + diff --git a/compat/thrust/detail/allocator/copy_construct_range.inl b/compat/thrust/detail/allocator/copy_construct_range.inl new file mode 100644 index 0000000..7c5478b --- /dev/null +++ b/compat/thrust/detail/allocator/copy_construct_range.inl @@ -0,0 +1,298 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace detail +{ +namespace allocator_traits_detail +{ + + +template + struct copy_construct_with_allocator +{ + Allocator &a; + + copy_construct_with_allocator(Allocator &a) + : a(a) + {} + + template + inline __host__ __device__ + void operator()(Tuple t) + { + const InputType &in = thrust::get<0>(t); + OutputType &out = thrust::get<1>(t); + + allocator_traits::construct(a, &out, in); + } +}; + + +template + struct needs_copy_construct_via_allocator + : has_member_construct2< + Allocator, + T, + T + > +{}; + + +// we know that std::allocator::construct's only effect is to call T's +// copy constructor, so we needn't use it for copy construction +template + struct needs_copy_construct_via_allocator, T> + : thrust::detail::false_type +{}; + + +// XXX it's regrettable that this implementation is copied almost +// exactly from system::detail::generic::uninitialized_copy +// perhaps generic::uninitialized_copy could call this routine +// with a default allocator +template + typename enable_if_convertible< + FromSystem, + ToSystem, + Pointer + >::type + uninitialized_copy_with_allocator(Allocator &a, + thrust::execution_policy &from_system, + thrust::execution_policy &to_system, + InputIterator first, + InputIterator last, + Pointer result) +{ + // zip up the iterators + typedef thrust::tuple IteratorTuple; + typedef thrust::zip_iterator ZipIterator; + + ZipIterator begin = thrust::make_zip_iterator(thrust::make_tuple(first,result)); + ZipIterator end = begin; + + // get a zip_iterator pointing to the end + const typename thrust::iterator_difference::type n = thrust::distance(first,last); + thrust::advance(end,n); + + // create a functor + typedef typename iterator_traits::value_type InputType; + typedef typename iterator_traits::value_type OutputType; + + // do the for_each + // note we use to_system to dispatch the for_each + thrust::for_each(to_system, begin, end, copy_construct_with_allocator(a)); + + // return the end of the output range + return thrust::get<1>(end.get_iterator_tuple()); +} + + +// XXX it's regrettable that this implementation 
is copied almost +// exactly from system::detail::generic::uninitialized_copy_n +// perhaps generic::uninitialized_copy_n could call this routine +// with a default allocator +template + typename enable_if_convertible< + FromSystem, + ToSystem, + Pointer + >::type + uninitialized_copy_with_allocator_n(Allocator &a, + thrust::execution_policy &from_system, + thrust::execution_policy &to_system, + InputIterator first, + Size n, + Pointer result) +{ + // zip up the iterators + typedef thrust::tuple IteratorTuple; + typedef thrust::zip_iterator ZipIterator; + + ZipIterator begin = thrust::make_zip_iterator(thrust::make_tuple(first,result)); + + // create a functor + typedef typename iterator_traits::value_type InputType; + typedef typename iterator_traits::value_type OutputType; + + // do the for_each_n + // note we use to_system to dispatch the for_each_n + ZipIterator end = thrust::for_each_n(to_system, begin, n, copy_construct_with_allocator(a)); + + // return the end of the output range + return thrust::get<1>(end.get_iterator_tuple()); +} + + +template + typename disable_if_convertible< + FromSystem, + ToSystem, + Pointer + >::type + uninitialized_copy_with_allocator(Allocator &, + thrust::execution_policy &from_system, + thrust::execution_policy &to_system, + InputIterator first, + InputIterator last, + Pointer result) +{ + // the systems aren't trivially interoperable + // just call two_system_copy and hope for the best + return thrust::detail::two_system_copy(from_system, to_system, first, last, result); +} // end uninitialized_copy_with_allocator() + + +template + typename disable_if_convertible< + FromSystem, + ToSystem, + Pointer + >::type + uninitialized_copy_with_allocator_n(Allocator &, + thrust::execution_policy &from_system, + thrust::execution_policy &to_system, + InputIterator first, + Size n, + Pointer result) +{ + // the systems aren't trivially interoperable + // just call two_system_copy_n and hope for the best + return thrust::detail::two_system_copy_n(from_system, to_system, first, n, result); +} // end uninitialized_copy_with_allocator_n() + + +template + typename disable_if< + needs_copy_construct_via_allocator< + Allocator, + typename pointer_element::type + >::value, + Pointer + >::type + copy_construct_range(thrust::execution_policy &from_system, + Allocator &a, + InputIterator first, + InputIterator last, + Pointer result) +{ + typename allocator_system::type &to_system = allocator_system::get(a); + + // just call two_system_copy + return thrust::detail::two_system_copy(from_system, to_system, first, last, result); +} + + +template + typename disable_if< + needs_copy_construct_via_allocator< + Allocator, + typename pointer_element::type + >::value, + Pointer + >::type + copy_construct_range_n(thrust::execution_policy &from_system, + Allocator &a, + InputIterator first, + Size n, + Pointer result) +{ + typename allocator_system::type &to_system = allocator_system::get(a); + + // just call two_system_copy_n + return thrust::detail::two_system_copy_n(from_system, to_system, first, n, result); +} + + +template + typename enable_if< + needs_copy_construct_via_allocator< + Allocator, + typename pointer_element::type + >::value, + Pointer + >::type + copy_construct_range(thrust::execution_policy &from_system, + Allocator &a, + InputIterator first, + InputIterator last, + Pointer result) +{ + typename allocator_system::type &to_system = allocator_system::get(a); + return uninitialized_copy_with_allocator(a, from_system, to_system, first, last, result); +} + + +template + 
typename enable_if< + needs_copy_construct_via_allocator< + Allocator, + typename pointer_element::type + >::value, + Pointer + >::type + copy_construct_range_n(thrust::execution_policy &from_system, + Allocator &a, + InputIterator first, + Size n, + Pointer result) +{ + typename allocator_system::type &to_system = allocator_system::get(a); + return uninitialized_copy_with_allocator_n(a, from_system, to_system, first, n, result); +} + + +} // end allocator_traits_detail + + +template + Pointer copy_construct_range(thrust::execution_policy &from_system, + Allocator &a, + InputIterator first, + InputIterator last, + Pointer result) +{ + return allocator_traits_detail::copy_construct_range(from_system, a, first, last, result); +} + + +template + Pointer copy_construct_range_n(thrust::execution_policy &from_system, + Allocator &a, + InputIterator first, + Size n, + Pointer result) +{ + return allocator_traits_detail::copy_construct_range_n(from_system, a, first, n, result); +} + + +} // end detail +} // end thrust + diff --git a/compat/thrust/detail/allocator/default_construct_range.h b/compat/thrust/detail/allocator/default_construct_range.h new file mode 100644 index 0000000..d83cb31 --- /dev/null +++ b/compat/thrust/detail/allocator/default_construct_range.h @@ -0,0 +1,36 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace thrust +{ +namespace detail +{ + + +template +inline void default_construct_range(Allocator &a, Pointer p, Size n); + + +} // end detail +} // end thrust + +#include + + diff --git a/compat/thrust/detail/allocator/default_construct_range.inl b/compat/thrust/detail/allocator/default_construct_range.inl new file mode 100644 index 0000000..45fe9c6 --- /dev/null +++ b/compat/thrust/detail/allocator/default_construct_range.inl @@ -0,0 +1,105 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
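copy_construct_range.inl zips the input range together with the raw output storage and runs copy_construct_with_allocator over the pairs, so every element is constructed through allocator_traits rather than by bare assignment; when the source and destination systems are not convertible it simply falls back to two_system_copy. Stripped of zip_iterator and execution policies, the core loop amounts to something like this serial, std-only sketch:

#include <memory>

// copy-construct the elements of [first, last) into raw storage at result,
// routing each construction through the allocator
template<typename Alloc, typename InputIt, typename T>
T *copy_construct_range_sketch(Alloc &a, InputIt first, InputIt last, T *result)
{
  typedef std::allocator_traits<Alloc> traits;
  for(; first != last; ++first, ++result)
    traits::construct(a, result, *first);
  return result;   // one past the last element constructed
}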
+ */ + +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace detail +{ +namespace allocator_traits_detail +{ + + +template + struct construct1_via_allocator +{ + Allocator &a; + + construct1_via_allocator(Allocator &a) + : a(a) + {} + + template + inline __host__ __device__ + void operator()(T &x) + { + allocator_traits::construct(a, &x); + } +}; + + +template + struct needs_default_construct_via_allocator + : has_member_construct1< + Allocator, + T + > +{}; + + +// we know that std::allocator::construct's only effect is to call T's +// default constructor, so we needn't use it for default construction +template + struct needs_default_construct_via_allocator, T> + : thrust::detail::false_type +{}; + + +template + typename enable_if< + needs_default_construct_via_allocator< + Allocator, + typename pointer_element::type + >::value + >::type + default_construct_range(Allocator &a, Pointer p, Size n) +{ + thrust::for_each_n(allocator_system::get(a), p, n, construct1_via_allocator(a)); +} + + +template + typename disable_if< + needs_default_construct_via_allocator< + Allocator, + typename pointer_element::type + >::value + >::type + default_construct_range(Allocator &a, Pointer p, Size n) +{ + thrust::uninitialized_fill_n(allocator_system::get(a), p, n, typename pointer_element::type()); +} + + +} // end allocator_traits_detail + + +template + void default_construct_range(Allocator &a, Pointer p, Size n) +{ + return allocator_traits_detail::default_construct_range(a,p,n); +} + + +} // end detail +} // end thrust + diff --git a/compat/thrust/detail/allocator/destroy_range.h b/compat/thrust/detail/allocator/destroy_range.h new file mode 100644 index 0000000..d690a60 --- /dev/null +++ b/compat/thrust/detail/allocator/destroy_range.h @@ -0,0 +1,33 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace thrust +{ +namespace detail +{ + +template + inline void destroy_range(Allocator &a, Pointer p, Size n); + +} // end detail +} // end thrust + +#include + diff --git a/compat/thrust/detail/allocator/destroy_range.inl b/compat/thrust/detail/allocator/destroy_range.inl new file mode 100644 index 0000000..ace2223 --- /dev/null +++ b/compat/thrust/detail/allocator/destroy_range.inl @@ -0,0 +1,158 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
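default_construct_range.inl wraps the per-element work in a tiny functor (construct1_via_allocator) so the same for_each_n call works on any backend, and only does that when needs_default_construct_via_allocator says the allocator actually customizes construct; otherwise it hands the whole range to uninitialized_fill_n with a value-initialized T. A serial sketch of the functor-driven branch, using the standard library in place of Thrust's dispatch:

#include <algorithm>
#include <memory>

// default-construct one element through the allocator, mirroring construct1_via_allocator
template<typename Alloc>
struct construct1_sketch
{
  Alloc *a;
  explicit construct1_sketch(Alloc &alloc) : a(&alloc) {}

  template<typename T>
  void operator()(T &x) const
  {
    std::allocator_traits<Alloc>::construct(*a, std::addressof(x));
  }
};

template<typename Alloc, typename T, typename Size>
void default_construct_range_sketch(Alloc &a, T *p, Size n)
{
  // element-wise, like thrust::for_each_n over the uninitialized storage
  std::for_each(p, p + n, construct1_sketch<Alloc>(a));
}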
+ */ + +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace detail +{ +namespace allocator_traits_detail +{ + + +// destroy_range has three cases: +// if Allocator has an effectful member function destroy: +// 1. destroy via the allocator +// else +// 2. if T has a non-trivial destructor, destroy the range without using the allocator +// 3. if T has a trivial destructor, do a no-op + +template + struct has_effectful_member_destroy + : has_member_destroy +{}; + +// std::allocator::destroy's only effect is to invoke its argument's destructor +template + struct has_effectful_member_destroy, T> + : thrust::detail::false_type +{}; + +// case 1: Allocator has an effectful 1-argument member function "destroy" +template + struct enable_if_destroy_range_case1 + : thrust::detail::enable_if< + has_effectful_member_destroy< + Allocator, + typename pointer_element::type + >::value + > +{}; + +// case 2: Allocator has no member function "destroy", but T has a non-trivial destructor +template + struct enable_if_destroy_range_case2 + : thrust::detail::enable_if< + !has_effectful_member_destroy< + Allocator, + typename pointer_element::type + >::value && + !has_trivial_destructor< + typename pointer_element::type + >::value + > +{}; + +// case 3: Allocator has no member function "destroy", and T has a trivial destructor +template + struct enable_if_destroy_range_case3 + : thrust::detail::enable_if< + !has_effectful_member_destroy< + Allocator, + typename pointer_element::type + >::value && + has_trivial_destructor< + typename pointer_element::type + >::value + > +{}; + + + +template + struct destroy_via_allocator +{ + Allocator &a; + + destroy_via_allocator(Allocator &a) + : a(a) + {} + + template + inline __host__ __device__ + void operator()(T &x) + { + allocator_traits::destroy(a, &x); + } +}; + + +// destroy_range case 1: destroy via allocator +template + typename enable_if_destroy_range_case1::type + destroy_range(Allocator &a, Pointer p, Size n) +{ + thrust::for_each_n(allocator_system::get(a), p, n, destroy_via_allocator(a)); +} + + +// we must prepare for His coming +struct gozer +{ + template + inline __host__ __device__ + void operator()(T &x) + { + x.~T(); + } +}; + +// destroy_range case 2: destroy without the allocator +template + typename enable_if_destroy_range_case2::type + destroy_range(Allocator &a, Pointer p, Size n) +{ + thrust::for_each_n(allocator_system::get(a), p, n, gozer()); +} + + +// destroy_range case 3: no-op +template + typename enable_if_destroy_range_case3::type + destroy_range(Allocator &, Pointer, Size) +{ + // no op +} + + +} // end allocator_traits_detail + + +template + void destroy_range(Allocator &a, Pointer p, Size n) +{ + return allocator_traits_detail::destroy_range(a,p,n); +} + + +} // end detail +} // end thrust + diff --git a/compat/thrust/detail/allocator/fill_construct_range.h b/compat/thrust/detail/allocator/fill_construct_range.h new file mode 100644 index 0000000..66fec41 --- /dev/null +++ b/compat/thrust/detail/allocator/fill_construct_range.h @@ -0,0 +1,35 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
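The comment block in destroy_range.inl spells out the three cases: destroy through the allocator when it has an effectful destroy, call ~T() element by element when the destructor is non-trivial, and do nothing at all when it is trivial (that is what the gozer functor and the three enable_if_destroy_range_case guards implement). Against modern type traits the same decision collapses to a few lines; this sketch folds cases 1 and 2 into allocator_traits::destroy and keeps case 3 as the trivial-destructor shortcut:

#include <memory>
#include <type_traits>

template<typename Alloc, typename T, typename Size>
void destroy_range_sketch(Alloc &a, T *p, Size n)
{
  // case 3: trivially destructible elements need no work at all
  if(!std::is_trivially_destructible<T>::value)
  {
    // cases 1 and 2: allocator_traits::destroy calls a.destroy(p) when the
    // allocator provides it, and falls back to p->~T() otherwise
    for(Size i = 0; i != n; ++i)
      std::allocator_traits<Alloc>::destroy(a, p + i);
  }
}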
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace thrust +{ +namespace detail +{ + + +template +inline void fill_construct_range(Allocator &a, Pointer p, Size n, const T &value); + + +} // end detail +} // end thrust + +#include + diff --git a/compat/thrust/detail/allocator/fill_construct_range.inl b/compat/thrust/detail/allocator/fill_construct_range.inl new file mode 100644 index 0000000..e2c9c09 --- /dev/null +++ b/compat/thrust/detail/allocator/fill_construct_range.inl @@ -0,0 +1,109 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace detail +{ +namespace allocator_traits_detail +{ + +// fill_construct_range has 2 cases: +// if Allocator has an effectful member function construct: +// 1. construct via the allocator +// else +// 2. 
construct via uninitialized_fill + +template + struct has_effectful_member_construct2 + : has_member_construct2 +{}; + +// std::allocator::construct's only effect is to invoke placement new +template + struct has_effectful_member_construct2,T,Arg1> + : thrust::detail::false_type +{}; + + +template + struct construct2_via_allocator +{ + Allocator &a; + Arg1 arg; + + construct2_via_allocator(Allocator &a, const Arg1 &arg) + : a(a), arg(arg) + {} + + template + inline __host__ __device__ + void operator()(T &x) + { + allocator_traits::construct(a, &x, arg); + } +}; + + +template + typename enable_if< + has_effectful_member_construct2< + Allocator, + typename pointer_element::type, + T + >::value + >::type + fill_construct_range(Allocator &a, Pointer p, Size n, const T &value) +{ + thrust::for_each_n(allocator_system::get(a), p, n, construct2_via_allocator(a, value)); +} + + +template + typename disable_if< + has_effectful_member_construct2< + Allocator, + typename pointer_element::type, + T + >::value + >::type + fill_construct_range(Allocator &a, Pointer p, Size n, const T &value) +{ + thrust::uninitialized_fill_n(allocator_system::get(a), p, n, value); +} + + +} // end allocator_traits_detail + + +template + void fill_construct_range(Alloc &a, Pointer p, Size n, const T &value) +{ + return allocator_traits_detail::fill_construct_range(a,p,n,value); +} + + +} // end detail +} // end thrust + diff --git a/compat/thrust/detail/allocator/malloc_allocator.h b/compat/thrust/detail/allocator/malloc_allocator.h new file mode 100644 index 0000000..cf4567e --- /dev/null +++ b/compat/thrust/detail/allocator/malloc_allocator.h @@ -0,0 +1,52 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace detail +{ + +template + class malloc_allocator + : public thrust::detail::tagged_allocator< + T, System, Pointer + > +{ + private: + typedef thrust::detail::tagged_allocator< + T, System, Pointer + > super_t; + + public: + typedef typename super_t::pointer pointer; + typedef typename super_t::size_type size_type; + + pointer allocate(size_type cnt); + + void deallocate(pointer p, size_type n); +}; + +} // end detail +} // end thrust + +#include + diff --git a/compat/thrust/detail/allocator/malloc_allocator.inl b/compat/thrust/detail/allocator/malloc_allocator.inl new file mode 100644 index 0000000..dd70202 --- /dev/null +++ b/compat/thrust/detail/allocator/malloc_allocator.inl @@ -0,0 +1,64 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace detail +{ + + +template + typename malloc_allocator::pointer + malloc_allocator + ::allocate(typename malloc_allocator::size_type cnt) +{ + using thrust::system::detail::generic::select_system; + + // XXX should use a hypothetical thrust::static_pointer_cast here + System system; + + pointer result = thrust::malloc(select_system(system), cnt); + + if(result.get() == 0) + { + throw thrust::system::detail::bad_alloc("malloc_allocator::allocate: malloc failed"); + } // end if + + return result; +} // end malloc_allocator::allocate() + + +template + void malloc_allocator + ::deallocate(typename malloc_allocator::pointer p, typename malloc_allocator::size_type n) +{ + using thrust::system::detail::generic::select_system; + + System system; + thrust::free(select_system(system), p); +} // end malloc_allocator + + +} // end detail +} // end thrust + diff --git a/compat/thrust/detail/allocator/no_throw_allocator.h b/compat/thrust/detail/allocator/no_throw_allocator.h new file mode 100644 index 0000000..ce397db --- /dev/null +++ b/compat/thrust/detail/allocator/no_throw_allocator.h @@ -0,0 +1,62 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace thrust +{ +namespace detail +{ + +template + struct no_throw_allocator : BaseAllocator +{ + private: + typedef BaseAllocator super_t; + + public: + inline no_throw_allocator(const BaseAllocator &other = BaseAllocator()) + : super_t(other) + {} + + template + struct rebind + { + typedef no_throw_allocator::other> other; + }; // end rebind + + void deallocate(typename super_t::pointer p, typename super_t::size_type n) + { + try + { + super_t::deallocate(p, n); + } // end try + catch(...) + { + // catch anything + } // end catch + } // end deallocate() + + inline bool operator==(no_throw_allocator const &other) { return super_t::operator==(other); } + inline bool operator!=(no_throw_allocator const &other) { return super_t::operator!=(other); } +}; // end no_throw_allocator + +} // end detail +} // end thrust + + diff --git a/compat/thrust/detail/allocator/tagged_allocator.h b/compat/thrust/detail/allocator/tagged_allocator.h new file mode 100644 index 0000000..3cb87a3 --- /dev/null +++ b/compat/thrust/detail/allocator/tagged_allocator.h @@ -0,0 +1,101 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
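no_throw_allocator.h, completed above, is a thin adaptor: everything is inherited from the base allocator, but deallocate swallows any exception so it is safe to call from destructors and cleanup paths. The same wrapper over any classic (pre-C++20 style) allocator can be written as the following sketch; the name is illustrative, not a Thrust type:

template<typename Base>
struct no_throw_wrapper : Base
{
  no_throw_wrapper(const Base &b = Base()) : Base(b) {}

  // keep rebinding inside the wrapper, as the patch's rebind member does
  template<typename U>
  struct rebind
  {
    typedef no_throw_wrapper<typename Base::template rebind<U>::other> other;
  };

  void deallocate(typename Base::pointer p, typename Base::size_type n)
  {
    try { Base::deallocate(p, n); }
    catch(...) { /* swallow everything: deallocation must not throw */ }
  }
};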
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace detail +{ + +template class tagged_allocator; + +template + class tagged_allocator +{ + public: + typedef void value_type; + typedef typename thrust::detail::pointer_traits::template rebind::other pointer; + typedef typename thrust::detail::pointer_traits::template rebind::other const_pointer; + typedef std::size_t size_type; + typedef typename thrust::detail::pointer_traits::difference_type difference_type; + typedef Tag system_type; + + template + struct rebind + { + typedef tagged_allocator other; + }; // end rebind +}; + +template + class tagged_allocator +{ + public: + typedef T value_type; + typedef typename thrust::detail::pointer_traits::template rebind::other pointer; + typedef typename thrust::detail::pointer_traits::template rebind::other const_pointer; + typedef typename thrust::iterator_reference::type reference; + typedef typename thrust::iterator_reference::type const_reference; + typedef std::size_t size_type; + typedef typename thrust::detail::pointer_traits::difference_type difference_type; + typedef Tag system_type; + + template + struct rebind + { + typedef tagged_allocator other; + }; // end rebind + + __host__ __device__ + inline tagged_allocator(); + + __host__ __device__ + inline tagged_allocator(const tagged_allocator &); + + template + __host__ __device__ + inline tagged_allocator(const tagged_allocator &); + + __host__ __device__ + inline ~tagged_allocator(); + + __host__ __device__ + pointer address(reference x) const; + + __host__ __device__ + const_pointer address(const_reference x) const; + + size_type max_size() const; +}; + +template +__host__ __device__ +bool operator==(const tagged_allocator &, const tagged_allocator &); + +template +__host__ __device__ +bool operator!=(const tagged_allocator &, const tagged_allocator &); + +} // end detail +} // end thrust + +#include + diff --git a/compat/thrust/detail/allocator/tagged_allocator.inl b/compat/thrust/detail/allocator/tagged_allocator.inl new file mode 100644 index 0000000..cb362a8 --- /dev/null +++ b/compat/thrust/detail/allocator/tagged_allocator.inl @@ -0,0 +1,97 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +namespace thrust +{ +namespace detail +{ + + +template + tagged_allocator + ::tagged_allocator() +{} + + +template + tagged_allocator + ::tagged_allocator(const tagged_allocator &) +{} + + +template + template + tagged_allocator + ::tagged_allocator(const tagged_allocator &) +{} + + +template + tagged_allocator + ::~tagged_allocator() +{} + + +template + typename tagged_allocator::pointer + tagged_allocator + ::address(reference x) const +{ + return &x; +} + + +template + typename tagged_allocator::const_pointer + tagged_allocator + ::address(const_reference x) const +{ + return &x; +} + + +template + typename tagged_allocator::size_type + tagged_allocator + ::max_size() const +{ + return (std::numeric_limits::max)() / sizeof(T); +} + + +template +__host__ __device__ +bool operator==(const tagged_allocator &, const tagged_allocator &) +{ + return true; +} + + +template +__host__ __device__ +bool operator!=(const tagged_allocator &, const tagged_allocator &) +{ + return false; +} + + +} // end detail +} // end thrust + diff --git a/compat/thrust/detail/allocator/temporary_allocator.h b/compat/thrust/detail/allocator/temporary_allocator.h new file mode 100644 index 0000000..f0496f9 --- /dev/null +++ b/compat/thrust/detail/allocator/temporary_allocator.h @@ -0,0 +1,75 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace detail +{ + + +// XXX the pointer parameter given to tagged_allocator should be related to +// the type of the expression get_temporary_buffer(system, n).first +// without decltype, compromise on pointer +template + class temporary_allocator + : public thrust::detail::tagged_allocator< + T, System, thrust::pointer + > +{ + private: + typedef thrust::detail::tagged_allocator< + T, System, thrust::pointer + > super_t; + + System &m_system; + + public: + typedef typename super_t::pointer pointer; + typedef typename super_t::size_type size_type; + + inline explicit temporary_allocator(thrust::execution_policy &system) : + super_t(), + m_system(thrust::detail::derived_cast(system)) + {} + + pointer allocate(size_type cnt); + + void deallocate(pointer p, size_type n); + + inline System &system() + { + return m_system; + } // end system() + + private: + typedef thrust::pair pointer_and_size; +}; // end temporary_allocator + + +} // end detail +} // end thrust + +#include + diff --git a/compat/thrust/detail/allocator/temporary_allocator.inl b/compat/thrust/detail/allocator/temporary_allocator.inl new file mode 100644 index 0000000..63221d5 --- /dev/null +++ b/compat/thrust/detail/allocator/temporary_allocator.inl @@ -0,0 +1,59 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
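tagged_allocator.inl, finished above, reflects that the allocator carries no state: address returns &x, max_size is just how many T fit in a size_type, and operator==/operator!= are unconditionally true/false because any instance may free what another allocated. The same three pieces for a hypothetical stateless allocator:

#include <cstddef>
#include <limits>

template<typename T>
struct stateless_alloc_sketch
{
  typedef std::size_t size_type;

  // largest number of T objects a size_type can count
  size_type max_size() const
  {
    return (std::numeric_limits<size_type>::max)() / sizeof(T);
  }
};

// stateless allocators are interchangeable, so equality is unconditional
template<typename T1, typename T2>
bool operator==(const stateless_alloc_sketch<T1>&, const stateless_alloc_sketch<T2>&) { return true; }

template<typename T1, typename T2>
bool operator!=(const stateless_alloc_sketch<T1>&, const stateless_alloc_sketch<T2>&) { return false; }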
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +namespace thrust +{ +namespace detail +{ + + +template + typename temporary_allocator::pointer + temporary_allocator + ::allocate(typename temporary_allocator::size_type cnt) +{ + pointer_and_size result = thrust::get_temporary_buffer(system(), cnt); + + // handle failure + if(result.second < cnt) + { + // deallocate and throw + // note that we pass cnt to deallocate, not a value derived from result.second + deallocate(result.first, cnt); + + throw thrust::system::detail::bad_alloc("temporary_buffer::allocate: get_temporary_buffer failed"); + } // end if + + return result.first; +} // end temporary_allocator::allocate() + + +template + void temporary_allocator + ::deallocate(typename temporary_allocator::pointer p, typename temporary_allocator::size_type n) +{ + return thrust::return_temporary_buffer(system(), p); +} // end temporary_allocator + + +} // end detail +} // end thrust + diff --git a/compat/thrust/detail/binary_search.inl b/compat/thrust/detail/binary_search.inl new file mode 100644 index 0000000..0fd799a --- /dev/null +++ b/compat/thrust/detail/binary_search.inl @@ -0,0 +1,458 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file binary_search.inl + * \brief Inline file for binary_search.h. 
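temporary_allocator.inl treats a short allocation as a failure: get_temporary_buffer returns a (pointer, count) pair, and if the count comes back smaller than requested the buffer is returned immediately and bad_alloc is thrown. The control flow, with stand-in get_buffer/return_buffer helpers in place of Thrust's get_temporary_buffer/return_temporary_buffer:

#include <cstddef>
#include <new>
#include <utility>

// stand-ins for thrust::get_temporary_buffer / thrust::return_temporary_buffer
template<typename T>
std::pair<T*, std::ptrdiff_t> get_buffer(std::ptrdiff_t n)
{
  T *p = static_cast<T*>(::operator new(n * sizeof(T), std::nothrow));
  return std::make_pair(p, p ? n : 0);
}

template<typename T>
void return_buffer(T *p) { ::operator delete(p); }

template<typename T>
T *allocate_temporary(std::ptrdiff_t cnt)
{
  std::pair<T*, std::ptrdiff_t> result = get_buffer<T>(cnt);

  if(result.second < cnt)            // got less than was asked for: treat as failure
  {
    return_buffer(result.first);     // hand back whatever was received
    throw std::bad_alloc();
  }

  return result.first;
}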
+ */ + +#include +#include +#include +#include +#include +#include + +namespace thrust +{ + + +template +ForwardIterator lower_bound(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + const LessThanComparable &value) +{ + using thrust::system::detail::generic::lower_bound; + return lower_bound(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, value); +} + + +template +ForwardIterator lower_bound(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + const T &value, + StrictWeakOrdering comp) +{ + using thrust::system::detail::generic::lower_bound; + return lower_bound(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, value, comp); +} + + +template +ForwardIterator upper_bound(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + const LessThanComparable &value) +{ + using thrust::system::detail::generic::upper_bound; + return upper_bound(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, value); +} + + +template +ForwardIterator upper_bound(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + const T &value, + StrictWeakOrdering comp) +{ + using thrust::system::detail::generic::upper_bound; + return upper_bound(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, value, comp); +} + + +template +bool binary_search(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + const LessThanComparable& value) +{ + using thrust::system::detail::generic::binary_search; + return binary_search(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, value); +} + + +template +bool binary_search(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + const T& value, + StrictWeakOrdering comp) +{ + using thrust::system::detail::generic::binary_search; + return binary_search(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, value, comp); +} + + +template +thrust::pair +equal_range(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + const T& value, + StrictWeakOrdering comp) +{ + using thrust::system::detail::generic::equal_range; + return equal_range(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, value, comp); +} + + +template +thrust::pair +equal_range(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + const LessThanComparable& value) +{ + using thrust::system::detail::generic::equal_range; + return equal_range(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, value); +} + + +template +OutputIterator lower_bound(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + InputIterator values_first, + InputIterator values_last, + OutputIterator output) +{ + using thrust::system::detail::generic::lower_bound; + return lower_bound(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, values_first, values_last, output); +} + + +template +OutputIterator lower_bound(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + InputIterator values_first, + InputIterator values_last, + OutputIterator output, + 
StrictWeakOrdering comp) +{ + using thrust::system::detail::generic::lower_bound; + return lower_bound(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, values_first, values_last, output, comp); +} + + +template +OutputIterator upper_bound(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + InputIterator values_first, + InputIterator values_last, + OutputIterator output) +{ + using thrust::system::detail::generic::upper_bound; + return upper_bound(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, values_first, values_last, output); +} + + +template +OutputIterator upper_bound(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + InputIterator values_first, + InputIterator values_last, + OutputIterator output, + StrictWeakOrdering comp) +{ + using thrust::system::detail::generic::upper_bound; + return upper_bound(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, values_first, values_last, output, comp); +} + + +template +OutputIterator binary_search(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + InputIterator values_first, + InputIterator values_last, + OutputIterator output) +{ + using thrust::system::detail::generic::binary_search; + return binary_search(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, values_first, values_last, output); +} + + +template +OutputIterator binary_search(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + InputIterator values_first, + InputIterator values_last, + OutputIterator output, + StrictWeakOrdering comp) +{ + using thrust::system::detail::generic::binary_search; + return binary_search(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, values_first, values_last, output, comp); +} + + +////////////////////// +// Scalar Functions // +////////////////////// + +template +ForwardIterator lower_bound(ForwardIterator first, + ForwardIterator last, + const LessThanComparable& value) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::lower_bound(select_system(system), first, last, value); +} + +template +ForwardIterator lower_bound(ForwardIterator first, + ForwardIterator last, + const T& value, + StrictWeakOrdering comp) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::lower_bound(select_system(system), first, last, value, comp); +} + +template +ForwardIterator upper_bound(ForwardIterator first, + ForwardIterator last, + const LessThanComparable& value) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::upper_bound(select_system(system), first, last, value); +} + +template +ForwardIterator upper_bound(ForwardIterator first, + ForwardIterator last, + const T& value, + StrictWeakOrdering comp) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::upper_bound(select_system(system), first, last, value, comp); +} + +template +bool binary_search(ForwardIterator first, + ForwardIterator last, + const LessThanComparable& 
value) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::binary_search(select_system(system), first, last, value); +} + +template +bool binary_search(ForwardIterator first, + ForwardIterator last, + const T& value, + StrictWeakOrdering comp) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::binary_search(select_system(system), first, last, value, comp); +} + +template +thrust::pair +equal_range(ForwardIterator first, + ForwardIterator last, + const LessThanComparable& value) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::equal_range(select_system(system), first, last, value); +} + +template +thrust::pair +equal_range(ForwardIterator first, + ForwardIterator last, + const T& value, + StrictWeakOrdering comp) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::equal_range(select_system(system), first, last, value, comp); +} + +////////////////////// +// Vector Functions // +////////////////////// + +template +OutputIterator lower_bound(ForwardIterator first, + ForwardIterator last, + InputIterator values_first, + InputIterator values_last, + OutputIterator output) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + + System1 system1; + System2 system2; + System3 system3; + + return thrust::lower_bound(select_system(system1,system2,system3), first, last, values_first, values_last, output); +} + +template +OutputIterator lower_bound(ForwardIterator first, + ForwardIterator last, + InputIterator values_first, + InputIterator values_last, + OutputIterator output, + StrictWeakOrdering comp) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + + System1 system1; + System2 system2; + System3 system3; + + return thrust::lower_bound(select_system(system1,system2,system3), first, last, values_first, values_last, output, comp); +} + +template +OutputIterator upper_bound(ForwardIterator first, + ForwardIterator last, + InputIterator values_first, + InputIterator values_last, + OutputIterator output) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + + System1 system1; + System2 system2; + System3 system3; + + return thrust::upper_bound(select_system(system1,system2,system3), first, last, values_first, values_last, output); +} + +template +OutputIterator upper_bound(ForwardIterator first, + ForwardIterator last, + InputIterator values_first, + InputIterator values_last, + OutputIterator output, + StrictWeakOrdering comp) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type 
System3; + + System1 system1; + System2 system2; + System3 system3; + + return thrust::upper_bound(select_system(system1,system2,system3), first, last, values_first, values_last, output, comp); +} + +template +OutputIterator binary_search(ForwardIterator first, + ForwardIterator last, + InputIterator values_first, + InputIterator values_last, + OutputIterator output) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + + System1 system1; + System2 system2; + System3 system3; + + return thrust::binary_search(select_system(system1,system2,system3), first, last, values_first, values_last, output); +} + +template +OutputIterator binary_search(ForwardIterator first, + ForwardIterator last, + InputIterator values_first, + InputIterator values_last, + OutputIterator output, + StrictWeakOrdering comp) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + + System1 system1; + System2 system2; + System3 system3; + + return thrust::binary_search(select_system(system1,system2,system3), first, last, values_first, values_last, output, comp); +} + +} // end namespace thrust + diff --git a/compat/thrust/detail/config.h b/compat/thrust/detail/config.h new file mode 100644 index 0000000..d6b6691 --- /dev/null +++ b/compat/thrust/detail/config.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/*! \file config.h + * \brief Defines platform configuration. + */ + +#pragma once + +#include + diff --git a/compat/thrust/detail/config/compiler.h b/compat/thrust/detail/config/compiler.h new file mode 100644 index 0000000..90ce911 --- /dev/null +++ b/compat/thrust/detail/config/compiler.h @@ -0,0 +1,103 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! 
\file compiler.h + * \brief Compiler-specific configuration + */ + +#pragma once + +#ifdef __CUDACC__ + +#include + +// Thrust supports CUDA >= 3.0 +#if CUDA_VERSION < 3000 +#error "CUDA v3.0 or newer is required" +#endif // CUDA_VERSION + +#endif // __CUDACC__ + +// enumerate host compilers we know about +#define THRUST_HOST_COMPILER_UNKNOWN 0 +#define THRUST_HOST_COMPILER_MSVC 1 +#define THRUST_HOST_COMPILER_GCC 2 + +// enumerate host compilers we know about +#define THRUST_DEVICE_COMPILER_UNKNOWN 0 +#define THRUST_DEVICE_COMPILER_MSVC 1 +#define THRUST_DEVICE_COMPILER_GCC 2 +#define THRUST_DEVICE_COMPILER_NVCC 3 + +// figure out which host compiler we're using +// XXX we should move the definition of THRUST_DEPRECATED out of this logic +#if defined(_MSC_VER) +#define THRUST_HOST_COMPILER THRUST_HOST_COMPILER_MSVC +#define THRUST_DEPRECATED __declspec(deprecated) +#elif defined(__GNUC__) +#define THRUST_HOST_COMPILER THRUST_HOST_COMPILER_GCC +#define THRUST_DEPRECATED __attribute__ ((deprecated)) +#define THRUST_GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) +#else +#define THRUST_HOST_COMPILER THRUST_HOST_COMPILER_UNKNOWN +#define THRUST_DEPRECATED +#endif // THRUST_HOST_COMPILER + +// figure out which device compiler we're using +#if defined(__CUDACC__) +#define THRUST_DEVICE_COMPILER THRUST_DEVICE_COMPILER_NVCC +#elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC +#define THRUST_DEVICE_COMPILER THRUST_DEVICE_COMPILER_MSVC +#elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC +#define THRUST_DEVICE_COMPILER THRUST_DEVICE_COMPILER_GCC +#else +#define THRUST_DEVICE_COMPILER THRUST_DEVICE_COMPILER_UNKNOWN +#endif + +// is the device compiler capable of compiling omp? +#ifdef _OPENMP +#define THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE THRUST_TRUE +#else +#define THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE THRUST_FALSE +#endif // _OPENMP + +// disable specific MSVC warnings +#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC) && !defined(__CUDA_ARCH__) +#define __THRUST_DISABLE_MSVC_WARNING_BEGIN(x) \ +__pragma(warning(push)) \ +__pragma(warning(disable : x)) +#define __THRUST_DISABLE_MSVC_WARNING_END(x) \ +__pragma(warning(pop)) +#else +#define __THRUST_DISABLE_MSVC_WARNING_BEGIN(x) +#define __THRUST_DISABLE_MSVC_WARNING_END(x) +#endif +#define __THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING(x) \ +__THRUST_DISABLE_MSVC_WARNING_BEGIN(4244 4267) \ +x;\ +__THRUST_DISABLE_MSVC_WARNING_END(4244 4267) +#define __THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN \ +__THRUST_DISABLE_MSVC_WARNING_BEGIN(4244 4267) +#define __THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END \ +__THRUST_DISABLE_MSVC_WARNING_END(4244 4267) +#define __THRUST_DISABLE_MSVC_FORCING_VALUE_TO_BOOL(x) \ +__THRUST_DISABLE_MSVC_WARNING_BEGIN(4800) \ +x;\ +__THRUST_DISABLE_MSVC_WARNING_END(4800) +#define __THRUST_DISABLE_MSVC_FORCING_VALUE_TO_BOOL_BEGIN \ +__THRUST_DISABLE_MSVC_WARNING_BEGIN(4800) +#define __THRUST_DISABLE_MSVC_FORCING_VALUE_TO_BOOL_END \ +__THRUST_DISABLE_MSVC_WARNING_END(4800) diff --git a/compat/thrust/detail/config/compiler_fence.h b/compat/thrust/detail/config/compiler_fence.h new file mode 100644 index 0000000..f5cbf98 --- /dev/null +++ b/compat/thrust/detail/config/compiler_fence.h @@ -0,0 +1,52 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
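compiler.h above only establishes integer identities (THRUST_HOST_COMPILER_MSVC, _GCC, ...) and sets THRUST_HOST_COMPILER / THRUST_DEVICE_COMPILER to one of them, so downstream headers can branch with ordinary #if comparisons. A small, hypothetical consumer of those macros (MY_RESTRICT is not a Thrust macro; the include assumes the compat/ directory is on the include path):

#include <thrust/detail/config.h>

// pick a restrict-qualifier spelling per host compiler
#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC
#  define MY_RESTRICT __restrict
#elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC
#  define MY_RESTRICT __restrict__
#else
#  define MY_RESTRICT
#endif

void axpy(int n, float a, const float *MY_RESTRICT x, float *MY_RESTRICT y)
{
  for(int i = 0; i < n; ++i)
    y[i] += a * x[i];
}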
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// msvc case +#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC + +#ifndef _DEBUG + +#include +#pragma intrinsic(_ReadWriteBarrier) +#define __thrust_compiler_fence() _ReadWriteBarrier() +#else + +#define __thrust_compiler_fence() do {} while (0) + +#endif // _DEBUG + +// gcc case +#elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC + +#if THRUST_GCC_VERSION >= 40200 // atomic built-ins were introduced ~4.2 +#define __thrust_compiler_fence() __sync_synchronize() +#else +// allow the code to compile without any guarantees +#define __thrust_compiler_fence() do {} while (0) +#endif // THRUST_GCC_VERSION + +// unknown case +#elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_UNKNOWN + +// allow the code to compile without any guarantees +#define __thrust_compiler_fence() do {} while (0) + +#endif + diff --git a/compat/thrust/detail/config/config.h b/compat/thrust/detail/config/config.h new file mode 100644 index 0000000..f3498ac --- /dev/null +++ b/compat/thrust/detail/config/config.h @@ -0,0 +1,36 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file config.h + * \brief Defines platform configuration. + */ + +#pragma once + +// XXX the order of these #includes matters + +#include +#include +// host_system.h & device_system.h must be #included as early as possible +// because other config headers depend on it +#include +#include +#include +#include +#include +#include +#include + diff --git a/compat/thrust/detail/config/debug.h b/compat/thrust/detail/config/debug.h new file mode 100644 index 0000000..56c1bad --- /dev/null +++ b/compat/thrust/detail/config/debug.h @@ -0,0 +1,32 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
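[Editor's note, not part of the patch] compiler_fence.h above reduces __thrust_compiler_fence() to _ReadWriteBarrier() on MSVC, __sync_synchronize() on gcc 4.2 and newer, and a no-op otherwise. A minimal sketch of the kind of code such a fence protects, assuming the macro is visible through thrust/detail/config.h as the include list in config/config.h suggests (the flag and payload names are illustrative only):

#include <thrust/detail/config.h>

volatile int g_ready = 0;   // raised by another thread
int g_payload = 0;          // written before g_ready is raised

int wait_for_payload()
{
  while (g_ready == 0)
  {
    // keep the compiler from reordering or caching the surrounding loads;
    // on gcc this is also a hardware barrier, on MSVC it is compiler-only
    __thrust_compiler_fence();
  }
  return g_payload;
}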
+ */ + +#pragma once + +#ifndef THRUST_DEBUG +# ifndef NDEBUG +# if (DEBUG || _DEBUG) +# define THRUST_DEBUG 1 +# endif // (DEBUG || _DEBUG) +# endif // NDEBUG +#endif // THRUST_DEBUG + +#if THRUST_DEBUG +# ifndef __THRUST_SYNCHRONOUS +# define __THRUST_SYNCHRONOUS 1 +# endif // __THRUST_SYNCHRONOUS +#endif // THRUST_DEBUG + diff --git a/compat/thrust/detail/config/device_system.h b/compat/thrust/detail/config/device_system.h new file mode 100644 index 0000000..a104906 --- /dev/null +++ b/compat/thrust/detail/config/device_system.h @@ -0,0 +1,61 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +// reserve 0 for undefined +#define THRUST_DEVICE_SYSTEM_CUDA 1 +#define THRUST_DEVICE_SYSTEM_OMP 2 +#define THRUST_DEVICE_SYSTEM_TBB 3 +#define THRUST_DEVICE_SYSTEM_CPP 4 + +#ifndef THRUST_DEVICE_SYSTEM +#define THRUST_DEVICE_SYSTEM THRUST_DEVICE_SYSTEM_CUDA +#endif // THRUST_DEVICE_SYSTEM + +// XXX make the use of THRUST_DEVICE_BACKEND an error in Thrust 1.7 +// XXX eliminate the following in Thrust 1.7 + +#define THRUST_DEVICE_BACKEND_CUDA THRUST_DEVICE_SYSTEM_CUDA +#define THRUST_DEVICE_BACKEND_OMP THRUST_DEVICE_SYSTEM_OMP +#define THRUST_DEVICE_BACKEND_TBB THRUST_DEVICE_SYSTEM_TBB + +#ifdef THRUST_DEVICE_BACKEND +# if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC +# pragma message("----------------------------------------------------------------------------------") +# pragma message("| WARNING: THRUST_DEVICE_BACKEND is deprecated; use THRUST_DEVICE_SYSTEM instead |") +# pragma message("----------------------------------------------------------------------------------") +# else +# warning ---------------------------------------------------------------------------------- +# warning | WARNING: THRUST_DEVICE_BACKEND is deprecated; use THRUST_DEVICE_SYSTEM instead | +# warning ---------------------------------------------------------------------------------- +# endif // THRUST_HOST_COMPILER +# undef THRUST_DEVICE_SYSTEM +# define THRUST_DEVICE_SYSTEM THRUST_DEVICE_BACKEND +#endif // THRUST_DEVICE_BACKEND + +#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA +#define __THRUST_DEVICE_SYSTEM_NAMESPACE cuda +#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_OMP +#define __THRUST_DEVICE_SYSTEM_NAMESPACE omp +#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_TBB +#define __THRUST_DEVICE_SYSTEM_NAMESPACE tbb +#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CPP +#define __THRUST_DEVICE_SYSTEM_NAMESPACE cpp +#endif + +#define __THRUST_DEVICE_SYSTEM_ROOT thrust/system/__THRUST_DEVICE_SYSTEM_NAMESPACE + diff --git a/compat/thrust/detail/config/forceinline.h b/compat/thrust/detail/config/forceinline.h new file mode 100644 index 0000000..620769b --- /dev/null +++ b/compat/thrust/detail/config/forceinline.h @@ -0,0 +1,36 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
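[Editor's note, not part of the patch] device_system.h above is what makes the device backend a compile-time choice: THRUST_DEVICE_SYSTEM defaults to CUDA but can be overridden on the command line. A hedged example of the same user code retargeted purely by that macro:

// Default build targets CUDA; alternatively something like
//   g++ -O2 -fopenmp -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_OMP ...
// runs the "device" algorithms on the host via OpenMP, with no source changes.
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/reduce.h>
#include <cstdio>

int main()
{
  thrust::device_vector<int> v(1000);
  thrust::sequence(v.begin(), v.end());           // 0,1,...,999
  int sum = thrust::reduce(v.begin(), v.end());   // executed by whichever backend
                                                  // THRUST_DEVICE_SYSTEM selected
  std::printf("%d\n", sum);                       // 499500
  return 0;
}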
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file forceinline.h + * \brief Defines __thrust_forceinline__ + */ + +#pragma once + +#include + +#if defined(__CUDACC__) + +#define __thrust_forceinline__ __forceinline__ + +#else + +// TODO add + +#define __thrust_forceinline__ + +#endif + diff --git a/compat/thrust/detail/config/hd_warning_disable.h b/compat/thrust/detail/config/hd_warning_disable.h new file mode 100644 index 0000000..b993ef2 --- /dev/null +++ b/compat/thrust/detail/config/hd_warning_disable.h @@ -0,0 +1,35 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file hd_warning_disable.h + * \brief Defines __thrust_hd_warning_disable__ + */ + +#pragma once + +#include + +#if defined(__CUDACC__) + +#define __thrust_hd_warning_disable__ \ +#pragma hd_warning_disable +#else + +#define __thrust_hd_warning_disable__ + +#endif + + diff --git a/compat/thrust/detail/config/host_device.h b/compat/thrust/detail/config/host_device.h new file mode 100644 index 0000000..5d0975d --- /dev/null +++ b/compat/thrust/detail/config/host_device.h @@ -0,0 +1,44 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! 
\file host_device.h + * \brief Defines __host__ and __device__ and other CUDA-isms + */ + +#pragma once + +#include + +#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA + +#include + +#else + +// since __host__ & __device__ might have already be defined, only +// #define them if not defined already +// XXX this will break if the client does #include later + +#ifndef __host__ +#define __host__ +#endif // __host__ + +#ifndef __device__ +#define __device__ +#endif // __device__ + +#endif + diff --git a/compat/thrust/detail/config/host_system.h b/compat/thrust/detail/config/host_system.h new file mode 100644 index 0000000..fb8edab --- /dev/null +++ b/compat/thrust/detail/config/host_system.h @@ -0,0 +1,58 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +// reserve 0 for undefined +#define THRUST_HOST_SYSTEM_CPP 1 +#define THRUST_HOST_SYSTEM_OMP 2 +#define THRUST_HOST_SYSTEM_TBB 3 + +#ifndef THRUST_HOST_SYSTEM +#define THRUST_HOST_SYSTEM THRUST_HOST_SYSTEM_CPP +#endif // THRUST_HOST_SYSTEM + +// XXX make the use of THRUST_HOST_BACKEND an error in Thrust 1.7 +// XXX eliminate the following in Thrust 1.7 + +#define THRUST_HOST_BACKEND_CPP THRUST_HOST_SYSTEM_CPP +#define THRUST_HOST_BACKEND_OMP THRUST_HOST_SYSTEM_OMP +#define THRUST_HOST_BACKEND_TBB THRUST_HOST_SYSTEM_TBB + +#ifdef THRUST_HOST_BACKEND +# if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC +# pragma message("------------------------------------------------------------------------------") +# pragma message("| WARNING: THRUST_HOST_BACKEND is deprecated; use THRUST_HOST_SYSTEM instead |") +# pragma message("------------------------------------------------------------------------------") +# else +# warning ------------------------------------------------------------------------------ +# warning | WARNING: THRUST_HOST_BACKEND is deprecated; use THRUST_HOST_SYSTEM instead | +# warning ------------------------------------------------------------------------------ +# endif // THRUST_HOST_COMPILER +# undef THRUST_HOST_SYSTEM +# define THRUST_HOST_SYSTEM THRUST_HOST_BACKEND +#endif // THRUST_HOST_BACKEND + +#if THRUST_HOST_SYSTEM == THRUST_HOST_SYSTEM_CPP +#define __THRUST_HOST_SYSTEM_NAMESPACE cpp +#elif THRUST_HOST_SYSTEM == THRUST_HOST_SYSTEM_OMP +#define __THRUST_HOST_SYSTEM_NAMESPACE omp +#elif THRUST_HOST_SYSTEM == THRUST_HOST_SYSTEM_TBB +#define __THRUST_HOST_SYSTEM_NAMESPACE tbb +#endif + +#define __THRUST_HOST_SYSTEM_ROOT thrust/system/__THRUST_HOST_SYSTEM_NAMESPACE + diff --git a/compat/thrust/detail/config/simple_defines.h b/compat/thrust/detail/config/simple_defines.h new file mode 100644 index 0000000..f9510ee --- /dev/null +++ b/compat/thrust/detail/config/simple_defines.h @@ -0,0 +1,28 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
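[Editor's note, not part of the patch] host_device.h above makes __host__ and __device__ vanish when the CUDA backend is not in play, so annotated functors stay portable across backends. A small hedged sketch (the functor name is invented) of the pattern that relies on it:

#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <cstdio>

struct triple
{
  __host__ __device__          // real qualifiers under nvcc, empty otherwise
  int operator()(int x) const { return 3 * x; }
};

int main()
{
  thrust::device_vector<int> d(4);
  d[0] = 1; d[1] = 2; d[2] = 3; d[3] = 4;
  thrust::transform(d.begin(), d.end(), d.begin(), triple());
  std::printf("%d\n", (int)d[3]);   // 12
  return 0;
}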
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file simple_defines.h + * \brief Primitive macros without dependencies. + */ + +#pragma once + +#define THRUST_UNKNOWN 0 +#define THRUST_FALSE 0 +#define THRUST_TRUE 1 + +#define THRUST_PREVENT_MACRO_SUBSTITUTION + diff --git a/compat/thrust/detail/contiguous_storage.h b/compat/thrust/detail/contiguous_storage.h new file mode 100644 index 0000000..fe72bce --- /dev/null +++ b/compat/thrust/detail/contiguous_storage.h @@ -0,0 +1,129 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ + +namespace detail +{ + +// XXX parameter T is redundant with parameter Alloc +template + class contiguous_storage +{ + private: + typedef thrust::detail::allocator_traits alloc_traits; + + public: + typedef Alloc allocator_type; + typedef T value_type; + typedef typename alloc_traits::pointer pointer; + typedef typename alloc_traits::const_pointer const_pointer; + typedef typename alloc_traits::size_type size_type; + typedef typename alloc_traits::difference_type difference_type; + + // XXX we should bring reference & const_reference into allocator_traits + // at the moment, it's unclear how -- we have nothing analogous to + // rebind_pointer for references + // we either need to add reference_traits or extend the existing + // pointer_traits to support wrapped references + typedef typename Alloc::reference reference; + typedef typename Alloc::const_reference const_reference; + + typedef thrust::detail::normal_iterator iterator; + typedef thrust::detail::normal_iterator const_iterator; + + explicit contiguous_storage(const allocator_type &alloc = allocator_type()); + + explicit contiguous_storage(size_type n, const allocator_type &alloc = allocator_type()); + + ~contiguous_storage(void); + + size_type size(void) const; + + size_type max_size(void) const; + + iterator begin(void); + + const_iterator begin(void) const; + + iterator end(void); + + const_iterator end(void) const; + + reference operator[](size_type n); + + const_reference operator[](size_type n) const; + + allocator_type get_allocator(void) const; + + // note that allocate does *not* automatically call deallocate + void allocate(size_type n); + + void deallocate(void); + + void swap(contiguous_storage &x); + + void default_construct_n(iterator first, size_type n); + + void uninitialized_fill_n(iterator first, size_type n, const value_type &value); + + template + iterator uninitialized_copy(InputIterator first, InputIterator last, iterator result); + + template + iterator 
uninitialized_copy(thrust::execution_policy &from_system, + InputIterator first, + InputIterator last, + iterator result); + + template + iterator uninitialized_copy_n(InputIterator first, Size n, iterator result); + + template + iterator uninitialized_copy_n(thrust::execution_policy &from_system, + InputIterator first, + Size n, + iterator result); + + void destroy(iterator first, iterator last); + + private: + // XXX we could inherit from this to take advantage of empty base class optimization + allocator_type m_allocator; + + iterator m_begin; + + size_type m_size; + + // disallow assignment + contiguous_storage &operator=(const contiguous_storage &x); +}; // end contiguous_storage + +} // end detail + +template void swap(detail::contiguous_storage &lhs, detail::contiguous_storage &rhs); + +} // end thrust + +#include + diff --git a/compat/thrust/detail/contiguous_storage.inl b/compat/thrust/detail/contiguous_storage.inl new file mode 100644 index 0000000..7e26c26 --- /dev/null +++ b/compat/thrust/detail/contiguous_storage.inl @@ -0,0 +1,245 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include // for use of std::swap in the WAR below + +namespace thrust +{ + +namespace detail +{ + +template + contiguous_storage + ::contiguous_storage(const Alloc &alloc) + :m_allocator(alloc), + m_begin(pointer(static_cast(0))), + m_size(0) +{ + ; +} // end contiguous_storage::contiguous_storage() + +template + contiguous_storage + ::contiguous_storage(size_type n, const Alloc &alloc) + :m_allocator(alloc), + m_begin(pointer(static_cast(0))), + m_size(0) +{ + allocate(n); +} // end contiguous_storage::contiguous_storage() + +template + contiguous_storage + ::~contiguous_storage(void) +{ + deallocate(); +} // end contiguous_storage::~contiguous_storage() + +template + typename contiguous_storage::size_type + contiguous_storage + ::size(void) const +{ + return m_size; +} // end contiguous_storage::size() + +template + typename contiguous_storage::size_type + contiguous_storage + ::max_size(void) const +{ + return alloc_traits::max_size(m_allocator); +} // end contiguous_storage::max_size() + +template + typename contiguous_storage::iterator + contiguous_storage + ::begin(void) +{ + return m_begin; +} // end contiguous_storage::begin() + +template + typename contiguous_storage::const_iterator + contiguous_storage + ::begin(void) const +{ + return m_begin; +} // end contiguous_storage::begin() + +template + typename contiguous_storage::iterator + contiguous_storage + ::end(void) +{ + return m_begin + size(); +} // end contiguous_storage::end() + +template + typename contiguous_storage::const_iterator + contiguous_storage + ::end(void) const +{ + return m_begin + size(); +} // end contiguous_storage::end() + +template + typename contiguous_storage::reference + contiguous_storage + ::operator[](size_type n) +{ + return m_begin[n]; +} // end 
contiguous_storage::operator[]() + +template + typename contiguous_storage::const_reference + contiguous_storage + ::operator[](size_type n) const +{ + return m_begin[n]; +} // end contiguous_storage::operator[]() + +template + typename contiguous_storage::allocator_type + contiguous_storage + ::get_allocator(void) const +{ + return m_allocator; +} // end contiguous_storage::get_allocator() + +template + void contiguous_storage + ::allocate(size_type n) +{ + if(n > 0) + { + m_begin = iterator(m_allocator.allocate(n)); + m_size = n; + } // end if + else + { + m_begin = iterator(pointer(static_cast(0))); + m_size = 0; + } // end else +} // end contiguous_storage::allocate() + +template + void contiguous_storage + ::deallocate(void) +{ + if(size() > 0) + { + m_allocator.deallocate(m_begin.base(), size()); + m_begin = iterator(pointer(static_cast(0))); + m_size = 0; + } // end if +} // end contiguous_storage::deallocate() + +template + void contiguous_storage + ::swap(contiguous_storage &x) +{ + thrust::swap(m_begin, x.m_begin); + thrust::swap(m_size, x.m_size); + + // XXX WAR nvcc 4.0's "calling a __host__ function from a __host__ __device__ function is not allowed" warning + //thrust::swap(m_allocator, x.m_allocator); + std::swap(m_allocator, x.m_allocator); +} // end contiguous_storage::swap() + +template + void contiguous_storage + ::default_construct_n(iterator first, size_type n) +{ + default_construct_range(m_allocator, first.base(), n); +} // end contiguous_storage::default_construct_n() + +template + void contiguous_storage + ::uninitialized_fill_n(iterator first, size_type n, const value_type &x) +{ + fill_construct_range(m_allocator, first.base(), n, x); +} // end contiguous_storage::uninitialized_fill() + +template + template + typename contiguous_storage::iterator + contiguous_storage + ::uninitialized_copy(thrust::execution_policy &from_system, InputIterator first, InputIterator last, iterator result) +{ + return iterator(copy_construct_range(from_system, m_allocator, first, last, result.base())); +} // end contiguous_storage::uninitialized_copy() + +template + template + typename contiguous_storage::iterator + contiguous_storage + ::uninitialized_copy(InputIterator first, InputIterator last, iterator result) +{ + // XXX assumes InputIterator's associated System is default-constructible + typename thrust::iterator_system::type from_system; + + return iterator(copy_construct_range(from_system, m_allocator, first, last, result.base())); +} // end contiguous_storage::uninitialized_copy() + +template + template + typename contiguous_storage::iterator + contiguous_storage + ::uninitialized_copy_n(thrust::execution_policy &from_system, InputIterator first, Size n, iterator result) +{ + return iterator(copy_construct_range_n(from_system, m_allocator, first, n, result.base())); +} // end contiguous_storage::uninitialized_copy_n() + +template + template + typename contiguous_storage::iterator + contiguous_storage + ::uninitialized_copy_n(InputIterator first, Size n, iterator result) +{ + // XXX assumes InputIterator's associated System is default-constructible + typename thrust::iterator_system::type from_system; + + return iterator(copy_construct_range_n(from_system, m_allocator, first, n, result.base())); +} // end contiguous_storage::uninitialized_copy_n() + +template + void contiguous_storage + ::destroy(iterator first, iterator last) +{ + destroy_range(m_allocator, first.base(), last - first); +} // end contiguous_storage::destroy() + +} // end detail + +template + void 
swap(detail::contiguous_storage &lhs, detail::contiguous_storage &rhs) +{ + lhs.swap(rhs); +} // end swap() + +} // end thrust + diff --git a/compat/thrust/detail/copy.h b/compat/thrust/detail/copy.h new file mode 100644 index 0000000..8ed3abd --- /dev/null +++ b/compat/thrust/detail/copy.h @@ -0,0 +1,87 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace thrust +{ + +template + OutputIterator copy(const thrust::detail::execution_policy_base &system, + InputIterator first, + InputIterator last, + OutputIterator result); + +template + OutputIterator copy_n(const thrust::detail::execution_policy_base &system, + InputIterator first, + Size n, + OutputIterator result); + +template + OutputIterator copy(InputIterator first, + InputIterator last, + OutputIterator result); + +template + OutputIterator copy_n(InputIterator first, + Size n, + OutputIterator result); + + +namespace detail +{ + + +template + OutputIterator two_system_copy(thrust::execution_policy &from_system, + thrust::execution_policy &two_system, + InputIterator first, + InputIterator last, + OutputIterator result); + + +template + OutputIterator two_system_copy_n(thrust::execution_policy &from_system, + thrust::execution_policy &two_system, + InputIterator first, + Size n, + OutputIterator result); + + +} // end detail +} // end thrust + +#include + diff --git a/compat/thrust/detail/copy.inl b/compat/thrust/detail/copy.inl new file mode 100644 index 0000000..9ac4807 --- /dev/null +++ b/compat/thrust/detail/copy.inl @@ -0,0 +1,124 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +#include +#include +#include +#include +#include + +namespace thrust +{ + + +template + OutputIterator copy(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + OutputIterator result) +{ + using thrust::system::detail::generic::copy; + return copy(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result); +} // end copy() + + +template + OutputIterator copy_n(const thrust::detail::execution_policy_base &exec, + InputIterator first, + Size n, + OutputIterator result) +{ + using thrust::system::detail::generic::copy_n; + return copy_n(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, n, result); +} // end copy_n() + + +namespace detail +{ + + +template + OutputIterator two_system_copy(thrust::execution_policy &system1, + thrust::execution_policy &system2, + InputIterator first, + InputIterator last, + OutputIterator result) +{ + using thrust::system::detail::generic::select_system; + + return thrust::copy(select_system(thrust::detail::derived_cast(thrust::detail::strip_const(system1)), thrust::detail::derived_cast(thrust::detail::strip_const(system2))), first, last, result); +} // end two_system_copy() + + +template + OutputIterator two_system_copy_n(thrust::execution_policy &system1, + thrust::execution_policy &system2, + InputIterator first, + Size n, + OutputIterator result) +{ + using thrust::system::detail::generic::select_system; + + return thrust::copy_n(select_system(thrust::detail::derived_cast(thrust::detail::strip_const(system1)), thrust::detail::derived_cast(thrust::detail::strip_const(system2))), first, n, result); +} // end two_system_copy_n() + + +} // end detail + + +template + OutputIterator copy(InputIterator first, + InputIterator last, + OutputIterator result) +{ + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + + System1 system1; + System2 system2; + + return thrust::detail::two_system_copy(system1, system2, first, last, result); +} // end copy() + + +template + OutputIterator copy_n(InputIterator first, + Size n, + OutputIterator result) +{ + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + + System1 system1; + System2 system2; + + return thrust::detail::two_system_copy_n(system1, system2, first, n, result); +} // end copy_n() + + +} // end namespace thrust + diff --git a/compat/thrust/detail/copy_if.h b/compat/thrust/detail/copy_if.h new file mode 100644 index 0000000..54e1ef4 --- /dev/null +++ b/compat/thrust/detail/copy_if.h @@ -0,0 +1,68 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
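[Editor's note, not part of the patch] copy.inl above shows that the iterator-only overloads deduce a system for each argument and hand both to two_system_copy, which is what lets a single thrust::copy call move data across the host/device boundary. A hedged usage sketch:

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/copy.h>

int main()
{
  thrust::host_vector<int> h(4);
  h[0] = 10; h[1] = 20; h[2] = 30; h[3] = 40;

  thrust::device_vector<int> d(4);
  thrust::copy(h.begin(), h.end(), d.begin());    // host -> device

  thrust::host_vector<int> h2(4);
  thrust::copy(d.begin(), d.end(), h2.begin());   // device -> host

  return (h2[2] == 30) ? 0 : 1;
}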
+ */ + +#pragma once + +#include +#include + +namespace thrust +{ + +template + OutputIterator copy_if(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + Predicate pred); + +template + OutputIterator copy_if(const thrust::detail::execution_policy_base &exec, + InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator result, + Predicate pred); + +template + OutputIterator copy_if(InputIterator first, + InputIterator last, + OutputIterator result, + Predicate pred); + +template + OutputIterator copy_if(InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator result, + Predicate pred); + +} // end thrust + +#include + diff --git a/compat/thrust/detail/copy_if.inl b/compat/thrust/detail/copy_if.inl new file mode 100644 index 0000000..e443bb7 --- /dev/null +++ b/compat/thrust/detail/copy_if.inl @@ -0,0 +1,105 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +namespace thrust +{ + + +template + OutputIterator copy_if(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + Predicate pred) +{ + using thrust::system::detail::generic::copy_if; + return copy_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, pred); +} // end copy_if() + + +template + OutputIterator copy_if(const thrust::detail::execution_policy_base &exec, + InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator result, + Predicate pred) +{ + using thrust::system::detail::generic::copy_if; + return copy_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, stencil, result, pred); +} // end copy_if() + + +template + OutputIterator copy_if(InputIterator first, + InputIterator last, + OutputIterator result, + Predicate pred) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + + System1 system1; + System2 system2; + + return thrust::copy_if(select_system(system1,system2), first, last, result, pred); +} // end copy_if() + + +template + OutputIterator copy_if(InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator result, + Predicate pred) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + + System1 system1; + System2 system2; + System3 system3; + + return thrust::copy_if(select_system(system1,system2,system3), first, last, stencil, result, pred); +} // end copy_if() + + +} // end thrust + diff --git a/compat/thrust/detail/count.inl b/compat/thrust/detail/count.inl new file mode 
100644 index 0000000..d2856ae --- /dev/null +++ b/compat/thrust/detail/count.inl @@ -0,0 +1,80 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file count.inl + * \brief Inline file for count.h. + */ + +#include +#include +#include +#include +#include +#include + +namespace thrust +{ + + +template + typename thrust::iterator_traits::difference_type + count(const thrust::detail::execution_policy_base &exec, InputIterator first, InputIterator last, const EqualityComparable& value) +{ + using thrust::system::detail::generic::count; + return count(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, value); +} // end count() + + +template + typename thrust::iterator_traits::difference_type + count_if(const thrust::detail::execution_policy_base &exec, InputIterator first, InputIterator last, Predicate pred) +{ + using thrust::system::detail::generic::count_if; + return count_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, pred); +} // end count_if() + + +template +typename thrust::iterator_traits::difference_type +count(InputIterator first, InputIterator last, const EqualityComparable& value) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::count(select_system(system), first, last, value); +} // end count() + + +template +typename thrust::iterator_traits::difference_type +count_if(InputIterator first, InputIterator last, Predicate pred) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::count_if(select_system(system), first, last, pred); +} // end count_if() + + +} // end namespace thrust + diff --git a/compat/thrust/detail/cstdint.h b/compat/thrust/detail/cstdint.h new file mode 100644 index 0000000..25d30fd --- /dev/null +++ b/compat/thrust/detail/cstdint.h @@ -0,0 +1,79 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
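[Editor's note, not part of the patch] The copy_if and count entry points above follow the same select_system pattern; in user code they usually appear together, first counting matches, then compacting them. A hedged sketch (the is_even functor is invented for the example):

#include <thrust/device_vector.h>
#include <thrust/count.h>
#include <thrust/copy.h>
#include <cstdio>

struct is_even
{
  __host__ __device__
  bool operator()(int x) const { return (x % 2) == 0; }
};

int main()
{
  int raw[] = {3, 4, 7, 8, 10};
  thrust::device_vector<int> d(raw, raw + 5);

  int n = thrust::count_if(d.begin(), d.end(), is_even());        // 3 matches

  thrust::device_vector<int> evens(n);
  thrust::copy_if(d.begin(), d.end(), evens.begin(), is_even());  // stream compaction

  std::printf("%d even values, first is %d\n", n, (int)evens[0]); // "3 even values, first is 4"
  return 0;
}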
+ */ + +#pragma once + +#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC) +#include +#endif + +namespace thrust +{ +namespace detail +{ + +#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC) + +#if (_MSC_VER < 1300) + typedef signed char int8_t; + typedef signed short int16_t; + typedef signed int int32_t; + typedef unsigned char uint8_t; + typedef unsigned short uint16_t; + typedef unsigned int uint32_t; +#else + typedef signed __int8 int8_t; + typedef signed __int16 int16_t; + typedef signed __int32 int32_t; + typedef unsigned __int8 uint8_t; + typedef unsigned __int16 uint16_t; + typedef unsigned __int32 uint32_t; +#endif +typedef signed __int64 int64_t; +typedef unsigned __int64 uint64_t; + +#else + +typedef ::int8_t int8_t; +typedef ::int16_t int16_t; +typedef ::int32_t int32_t; +typedef ::int64_t int64_t; +typedef ::uint8_t uint8_t; +typedef ::uint16_t uint16_t; +typedef ::uint32_t uint32_t; +typedef ::uint64_t uint64_t; + +#endif + + +// an oracle to tell us how to define intptr_t +template struct divine_intptr_t; +template struct divine_uintptr_t; + +// 32b platforms +template<> struct divine_intptr_t<4> { typedef thrust::detail::int32_t type; }; +template<> struct divine_uintptr_t<4> { typedef thrust::detail::uint32_t type; }; + +// 64b platforms +template<> struct divine_intptr_t<8> { typedef thrust::detail::int64_t type; }; +template<> struct divine_uintptr_t<8> { typedef thrust::detail::uint64_t type; }; + +typedef divine_intptr_t<>::type intptr_t; +typedef divine_uintptr_t<>::type uintptr_t; + +} // end detail +} // end thrust + diff --git a/compat/thrust/detail/device_delete.inl b/compat/thrust/detail/device_delete.inl new file mode 100644 index 0000000..dd70d76 --- /dev/null +++ b/compat/thrust/detail/device_delete.inl @@ -0,0 +1,47 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file device_delete.inl + * \brief Inline file for device_delete.h. + */ + +#include +#include +#include + +namespace thrust +{ +namespace detail +{ + +// define an empty allocator class to use below +struct device_delete_allocator {}; + +} + +template + void device_delete(device_ptr ptr, + const size_t n) +{ + // we can use device_allocator to destroy the range + thrust::detail::device_delete_allocator a; + thrust::detail::destroy_range(a, ptr, n); + thrust::device_free(ptr); +} // end device_delete() + +} // end thrust + diff --git a/compat/thrust/detail/device_free.inl b/compat/thrust/detail/device_free.inl new file mode 100644 index 0000000..ab8db9f --- /dev/null +++ b/compat/thrust/detail/device_free.inl @@ -0,0 +1,44 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file device_free.inl + * \brief Inline file for device_free.h. + */ + +#include +#include +#include +#include +#include + +namespace thrust +{ + +void device_free(thrust::device_ptr ptr) +{ + using thrust::system::detail::generic::select_system; + + typedef thrust::iterator_system< thrust::device_ptr >::type system; + + // XXX lower to select_system(system) here + system s; + + thrust::free(s, ptr); +} // end device_free() + +} // end thrust + diff --git a/compat/thrust/detail/device_malloc.inl b/compat/thrust/detail/device_malloc.inl new file mode 100644 index 0000000..76d0029 --- /dev/null +++ b/compat/thrust/detail/device_malloc.inl @@ -0,0 +1,60 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file device_malloc.inl + * \brief Inline file for device_malloc.h. + */ + +#include +#include +#include +#include +#include + +namespace thrust +{ + + +thrust::device_ptr device_malloc(const std::size_t n) +{ + using thrust::system::detail::generic::select_system; + + typedef thrust::iterator_system< thrust::device_ptr >::type system; + + // XXX lower to select_system(system) here + system s; + + return thrust::device_ptr(thrust::malloc(s, n).get()); +} // end device_malloc() + + +template + thrust::device_ptr device_malloc(const std::size_t n) +{ + using thrust::system::detail::generic::select_system; + + typedef thrust::iterator_system< thrust::device_ptr >::type system; + + // XXX lower to select_system(system) here + system s; + + return thrust::device_ptr(thrust::malloc(s,n).get()); +} // end device_malloc() + + +} // end thrust + diff --git a/compat/thrust/detail/device_new.inl b/compat/thrust/detail/device_new.inl new file mode 100644 index 0000000..1f00a97 --- /dev/null +++ b/compat/thrust/detail/device_new.inl @@ -0,0 +1,60 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file device_new.inl + * \brief Inline file for device_new.h. 
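[Editor's note, not part of the patch] device_malloc.inl and device_free.inl above are the low-level allocation entry points; device_vector is normally preferable, but the manual pairing looks roughly like this (a hedged sketch, nothing specific to this commit):

#include <thrust/device_ptr.h>
#include <thrust/device_malloc.h>
#include <thrust/device_free.h>
#include <thrust/fill.h>

int main()
{
  // typed form: space for 100 ints on the device system
  thrust::device_ptr<int> p = thrust::device_malloc<int>(100);

  thrust::fill(p, p + 100, 7);   // device_ptr behaves as a random-access iterator
  int x = p[42];                 // reads the value back through the pointer

  thrust::device_free(p);        // must be paired with device_malloc
  return (x == 7) ? 0 : 1;
}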
+ */ + +#include +#include +#include + +namespace thrust +{ + +template + device_ptr device_new(device_ptr p, + const size_t n) +{ + // XXX TODO dispatch n null device constructors at p here + // in the meantime, dispatch 1 null host constructor here + // and dispatch n copy constructors + return device_new(p, T(), n); +} // end device_new() + +template + device_ptr device_new(device_ptr p, + const T &exemplar, + const size_t n) +{ + device_ptr result(reinterpret_cast(p.get())); + + // run copy constructors at p here + thrust::uninitialized_fill(result, result + n, exemplar); + + return result; +} // end device_new() + +template + device_ptr device_new(const size_t n) +{ + // call placement new + return device_new(thrust::device_malloc(n)); +} // end device_new() + +} // thrust + diff --git a/compat/thrust/detail/device_ptr.inl b/compat/thrust/detail/device_ptr.inl new file mode 100644 index 0000000..0afe8a1 --- /dev/null +++ b/compat/thrust/detail/device_ptr.inl @@ -0,0 +1,74 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file device_ptr.inl + * \brief Inline file for device_ptr.h. + */ + +#include +#include +#include + +#include +#include + +namespace thrust +{ + +template + device_ptr device_pointer_cast(T *ptr) +{ + return device_ptr(ptr); +} // end device_pointer_cast() + +template + device_ptr device_pointer_cast(const device_ptr &ptr) +{ + return ptr; +} // end device_pointer_cast() + +// output to ostream +template + std::basic_ostream &operator<<(std::basic_ostream &os, const device_ptr &p) +{ + return os << p.get(); +} // end operator<<() + + +namespace detail +{ + +template + struct is_device_ptr< thrust::device_ptr > + : public true_type +{ +}; // end is_device_ptr + +#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC) && (_MSC_VER <= 1400) +// XXX WAR MSVC 2005 problem with correctly implementing +// pointer_raw_pointer for device_ptr by specializing it here +template + struct pointer_raw_pointer< thrust::device_ptr > +{ + typedef typename device_ptr::raw_pointer type; +}; // end pointer_raw_pointer +#endif + + +} // end namespace detail +} // end namespace thrust + diff --git a/compat/thrust/detail/device_reference.inl b/compat/thrust/detail/device_reference.inl new file mode 100644 index 0000000..ad5cb76 --- /dev/null +++ b/compat/thrust/detail/device_reference.inl @@ -0,0 +1,53 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! 
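[Editor's note, not part of the patch] device_ptr.inl above supplies device_pointer_cast, the bridge between raw CUDA allocations and Thrust algorithms. A hedged sketch of the usual pattern, assuming a CUDA build (the raw-API calls are plain cudaMalloc/cudaFree, nothing from this commit):

#include <thrust/device_ptr.h>
#include <thrust/fill.h>
#include <cuda_runtime.h>

int main()
{
  int *raw = 0;
  cudaMalloc(&raw, 64 * sizeof(int));               // raw CUDA allocation

  thrust::device_ptr<int> p = thrust::device_pointer_cast(raw);
  thrust::fill(p, p + 64, 9);                       // now usable by Thrust algorithms

  int back = p[10];                                 // 9
  cudaFree(raw);
  return (back == 9) ? 0 : 1;
}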
\file device_reference.inl + * \brief Inline file for device_reference.h. + */ + +#include +#include + +namespace thrust +{ + +template + template + device_reference & + device_reference + ::operator=(const device_reference &other) +{ + return super_t::operator=(other); +} // end operator=() + +template + device_reference & + device_reference + ::operator=(const value_type &x) +{ + return super_t::operator=(x); +} // end operator=() + +template +__host__ __device__ +void swap(device_reference &a, device_reference &b) +{ + a.swap(b); +} // end swap() + +} // end thrust + diff --git a/compat/thrust/detail/device_vector.inl b/compat/thrust/detail/device_vector.inl new file mode 100644 index 0000000..f6bafba --- /dev/null +++ b/compat/thrust/detail/device_vector.inl @@ -0,0 +1,37 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file device_vector.inl + * \brief Inline file for device_vector.h. + */ + +#include + +namespace thrust +{ + +template + template + device_vector + ::device_vector(const host_vector &v) + :Parent(v) +{ + ; +} // end device_vector::device_vector() + +} // end namespace thrust + diff --git a/compat/thrust/detail/dispatch/is_trivial_copy.h b/compat/thrust/detail/dispatch/is_trivial_copy.h new file mode 100644 index 0000000..2bedf1f --- /dev/null +++ b/compat/thrust/detail/dispatch/is_trivial_copy.h @@ -0,0 +1,59 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file trivial_copy.h + * \brief Device implementations for copying memory between host and device. 
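[Editor's note, not part of the patch] device_vector.inl above adds the converting constructor from host_vector, and device_reference.inl gives element accesses reference-like assignment and swap. A hedged sketch of both in user code:

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>

int main()
{
  thrust::host_vector<int> h(3);
  h[0] = 1; h[1] = 2; h[2] = 3;

  thrust::device_vector<int> d = h;     // the host_vector -> device_vector constructor

  thrust::device_reference<int> r = d[0];
  r = 42;                               // assignment writes straight to device memory

  return (d[0] == 42) ? 0 : 1;
}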
+ */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ + +namespace detail +{ + +namespace dispatch +{ + + +// a trivial copy's iterator's value_types match, +// the iterators themselves are normal_iterators +// and the ToIterator's value_type has_trivial_assign +template + struct is_trivial_copy : + integral_constant< + bool, + is_same< + typename thrust::iterator_value::type, + typename thrust::iterator_value::type + >::value + && is_trivial_iterator::value + && is_trivial_iterator::value + && has_trivial_assign::type>::value + > {}; + +} // end namespace dispatch + +} // end namespace detail + +} // end namespace thrust + diff --git a/compat/thrust/detail/distance.inl b/compat/thrust/detail/distance.inl new file mode 100644 index 0000000..f37595f --- /dev/null +++ b/compat/thrust/detail/distance.inl @@ -0,0 +1,39 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file distance.inl + * \brief Inline file for distance.h + */ + +#include +#include +#include + +namespace thrust +{ + + +template + inline typename thrust::iterator_traits::difference_type + distance(InputIterator first, InputIterator last) +{ + return thrust::system::detail::generic::distance(first, last); +} // end distance() + + +} // end namespace thrust + diff --git a/compat/thrust/detail/equal.inl b/compat/thrust/detail/equal.inl new file mode 100644 index 0000000..ca6fecc --- /dev/null +++ b/compat/thrust/detail/equal.inl @@ -0,0 +1,82 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file equal.inl + * \brief Inline file for equal.h. 
+ */ + +#include +#include +#include +#include +#include + +namespace thrust +{ + + +template +bool equal(const thrust::detail::execution_policy_base &system, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2) +{ + using thrust::system::detail::generic::equal; + return equal(thrust::detail::derived_cast(thrust::detail::strip_const(system)), first1, last1, first2); +} // end equal() + + +template +bool equal(const thrust::detail::execution_policy_base &system, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, BinaryPredicate binary_pred) +{ + using thrust::system::detail::generic::equal; + return equal(thrust::detail::derived_cast(thrust::detail::strip_const(system)), first1, last1, first2, binary_pred); +} // end equal() + + +template +bool equal(InputIterator1 first1, InputIterator1 last1, + InputIterator2 first2) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + + System1 system1; + System2 system2; + + return thrust::equal(select_system(system1,system2), first1, last1, first2); +} + + +template +bool equal(InputIterator1 first1, InputIterator1 last1, + InputIterator2 first2, BinaryPredicate binary_pred) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + + System1 system1; + System2 system2; + + return thrust::equal(select_system(system1,system2), first1, last1, first2, binary_pred); +} + + +} // end namespace thrust + diff --git a/compat/thrust/detail/execute_with_allocator.h b/compat/thrust/detail/execute_with_allocator.h new file mode 100644 index 0000000..9d3c1ba --- /dev/null +++ b/compat/thrust/detail/execute_with_allocator.h @@ -0,0 +1,84 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace detail +{ + +template +__host__ __device__ +ToPointer reinterpret_pointer_cast(FromPointer ptr) +{ + typedef typename thrust::detail::pointer_element::type to_element; + return ToPointer(reinterpret_cast(thrust::raw_pointer_cast(ptr))); +} + + +template class BaseSystem> + struct execute_with_allocator + : BaseSystem > +{ + Allocator &m_alloc; + + execute_with_allocator(Allocator &alloc) + : m_alloc(alloc) + {} + + template + friend thrust::pair + get_temporary_buffer(execute_with_allocator &system, std::ptrdiff_t n) + { + typedef typename thrust::detail::allocator_traits alloc_traits; + typedef typename alloc_traits::void_pointer void_pointer; + typedef typename alloc_traits::size_type size_type; + typedef typename alloc_traits::value_type value_type; + + // how many elements of type value_type do we need to accomodate n elements of type T? 
+ size_type num_elements = thrust::detail::util::divide_ri(sizeof(T) * n, sizeof(value_type)); + + // allocate that many + void_pointer ptr = alloc_traits::allocate(system.m_alloc, num_elements); + + // return the pointer and the number of elements of type T allocated + return thrust::make_pair(thrust::detail::reinterpret_pointer_cast(ptr),n); + } + + template + friend void return_temporary_buffer(execute_with_allocator &system, Pointer p) + { + typedef typename thrust::detail::allocator_traits alloc_traits; + typedef typename alloc_traits::pointer pointer; + + // return the pointer to the allocator + pointer to_ptr = thrust::detail::reinterpret_pointer_cast(p); + alloc_traits::deallocate(system.m_alloc, to_ptr, 0); + } +}; + + +} // end detail +} // end thrust + diff --git a/compat/thrust/detail/execution_policy.h b/compat/thrust/detail/execution_policy.h new file mode 100644 index 0000000..28e77f2 --- /dev/null +++ b/compat/thrust/detail/execution_policy.h @@ -0,0 +1,78 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace thrust +{ +namespace detail +{ + + +// execution_policy_base serves as a guard against +// inifinite recursion in thrust entry points: +// +// template +// void foo(const thrust::detail::execution_policy_base &s) +// { +// using thrust::system::detail::generic::foo; +// +// foo(thrust::detail::derived_cast(thrust::detail::strip_const(s)); +// } +// +// foo is not recursive when +// 1. DerivedPolicy is derived from thrust::execution_policy below +// 2. generic::foo takes thrust::execution_policy as a parameter +template struct execution_policy_base {}; + + +template +__host__ __device__ +inline execution_policy_base &strip_const(const execution_policy_base &x) +{ + return const_cast&>(x); +} + + +template +__host__ __device__ +inline DerivedPolicy &derived_cast(execution_policy_base &x) +{ + return static_cast(x); +} + + +template +__host__ __device__ +inline const DerivedPolicy &derived_cast(const execution_policy_base &x) +{ + return static_cast(x); +} + + +} // end detail + + +template + struct execution_policy + : thrust::detail::execution_policy_base +{}; + + +} // end thrust + diff --git a/compat/thrust/detail/extrema.inl b/compat/thrust/detail/extrema.inl new file mode 100644 index 0000000..4bcd0bd --- /dev/null +++ b/compat/thrust/detail/extrema.inl @@ -0,0 +1,160 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
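[Editor's note, not part of the patch] execution_policy.h above anchors the whole dispatch scheme: every algorithm entry point takes an execution_policy_base and forwards through derived_cast/strip_const, which also lets callers name a policy explicitly instead of relying on iterator-based deduction. A hedged sketch, assuming the bundled version ships the usual thrust::host and thrust::device policy objects:

#include <thrust/execution_policy.h>
#include <thrust/device_vector.h>
#include <thrust/reduce.h>

int main()
{
  int h_data[] = {1, 2, 3, 4};

  // host policy: run the host backend directly on a plain array
  int h_sum = thrust::reduce(thrust::host, h_data, h_data + 4);

  thrust::device_vector<int> d(h_data, h_data + 4);
  // device policy: states explicitly what the iterators already imply
  int d_sum = thrust::reduce(thrust::device, d.begin(), d.end());

  return (h_sum == 10 && d_sum == 10) ? 0 : 1;
}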
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#include +#include +#include +#include +#include +#include + +namespace thrust +{ + + +template +ForwardIterator min_element(const thrust::detail::execution_policy_base &exec, ForwardIterator first, ForwardIterator last) +{ + using thrust::system::detail::generic::min_element; + return min_element(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last); +} // end min_element() + + +template +ForwardIterator min_element(const thrust::detail::execution_policy_base &exec, ForwardIterator first, ForwardIterator last, BinaryPredicate comp) +{ + using thrust::system::detail::generic::min_element; + return min_element(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, comp); +} // end min_element() + + +template +ForwardIterator max_element(const thrust::detail::execution_policy_base &exec, ForwardIterator first, ForwardIterator last) +{ + using thrust::system::detail::generic::max_element; + return max_element(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last); +} // end max_element() + + +template +ForwardIterator max_element(const thrust::detail::execution_policy_base &exec, ForwardIterator first, ForwardIterator last, BinaryPredicate comp) +{ + using thrust::system::detail::generic::max_element; + return max_element(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, comp); +} // end max_element() + + +template +thrust::pair minmax_element(const thrust::detail::execution_policy_base &exec, ForwardIterator first, ForwardIterator last) +{ + using thrust::system::detail::generic::minmax_element; + return minmax_element(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last); +} // end minmax_element() + + +template +thrust::pair minmax_element(const thrust::detail::execution_policy_base &exec, ForwardIterator first, ForwardIterator last, BinaryPredicate comp) +{ + using thrust::system::detail::generic::minmax_element; + return minmax_element(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, comp); +} // end minmax_element() + + +template +ForwardIterator min_element(ForwardIterator first, ForwardIterator last) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::min_element(select_system(system), first, last); +} // end min_element() + + +template +ForwardIterator min_element(ForwardIterator first, ForwardIterator last, + BinaryPredicate comp) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::min_element(select_system(system), first, last, comp); +} // end min_element() + + +template +ForwardIterator max_element(ForwardIterator first, ForwardIterator last) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::max_element(select_system(system), first, last); +} // end max_element() + + +template +ForwardIterator max_element(ForwardIterator first, ForwardIterator last, + BinaryPredicate comp) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::max_element(select_system(system), first, last, comp); +} // end max_element() + + 
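The overloads above differ only in how they obtain an execution policy: the policy-taking forms forward to the generic system implementations through derived_cast/strip_const, while the iterator-only forms first derive a policy from the iterators with select_system. A minimal usage sketch of the dispatch from the caller's side (the header paths and vector contents are assumed for illustration, not part of this patch):

#include <thrust/host_vector.h>
#include <thrust/extrema.h>
#include <cstdio>

int main()
{
  thrust::host_vector<int> v(4);
  v[0] = 3; v[1] = 9; v[2] = 1; v[3] = 7;

  // iterator-only overload: select_system deduces the host backend
  // from the host_vector iterators and forwards to the policy overload
  thrust::pair<thrust::host_vector<int>::iterator,
               thrust::host_vector<int>::iterator> mm =
    thrust::minmax_element(v.begin(), v.end());

  std::printf("min %d max %d\n", *mm.first, *mm.second);  // expected: min 1 max 9
  return 0;
}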
+template +thrust::pair +minmax_element(ForwardIterator first, ForwardIterator last) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::minmax_element(select_system(system), first, last); +} // end minmax_element() + + +template +thrust::pair +minmax_element(ForwardIterator first, ForwardIterator last, BinaryPredicate comp) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::minmax_element(select_system(system), first, last, comp); +} // end minmax_element() + + +} // end namespace thrust + diff --git a/compat/thrust/detail/fill.inl b/compat/thrust/detail/fill.inl new file mode 100644 index 0000000..c60e4a0 --- /dev/null +++ b/compat/thrust/detail/fill.inl @@ -0,0 +1,85 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file fill.inl + * \brief Inline file for fill.h. + */ + +#include +#include +#include +#include +#include + +namespace thrust +{ + + +template + void fill(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + const T &value) +{ + using thrust::system::detail::generic::fill; + return fill(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, value); +} // end fill() + + +template + OutputIterator fill_n(const thrust::detail::execution_policy_base &exec, + OutputIterator first, + Size n, + const T &value) +{ + using thrust::system::detail::generic::fill_n; + return fill_n(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, n, value); +} // end fill_n() + + +template + void fill(ForwardIterator first, + ForwardIterator last, + const T &value) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + thrust::fill(select_system(system), first, last, value); +} // end fill() + + +template + OutputIterator fill_n(OutputIterator first, + Size n, + const T &value) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::fill_n(select_system(system), first, n, value); +} // end fill() + + +} // end namespace thrust + diff --git a/compat/thrust/detail/find.inl b/compat/thrust/detail/find.inl new file mode 100644 index 0000000..465c937 --- /dev/null +++ b/compat/thrust/detail/find.inl @@ -0,0 +1,109 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file find.inl + * \brief Inline file for find.h + */ + +#include +#include +#include +#include +#include + +namespace thrust +{ + + +template +InputIterator find(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + const T& value) +{ + using thrust::system::detail::generic::find; + return find(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, value); +} // end find() + + +template +InputIterator find_if(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + Predicate pred) +{ + using thrust::system::detail::generic::find_if; + return find_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, pred); +} // end find_if() + + +template +InputIterator find_if_not(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + Predicate pred) +{ + using thrust::system::detail::generic::find_if_not; + return find_if_not(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, pred); +} // end find_if_not() + + +template +InputIterator find(InputIterator first, + InputIterator last, + const T& value) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::find(select_system(system), first, last, value); +} + +template +InputIterator find_if(InputIterator first, + InputIterator last, + Predicate pred) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::find_if(select_system(system), first, last, pred); +} + +template +InputIterator find_if_not(InputIterator first, + InputIterator last, + Predicate pred) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::find_if_not(select_system(system), first, last, pred); +} + + +} // end namespace thrust + diff --git a/compat/thrust/detail/for_each.inl b/compat/thrust/detail/for_each.inl new file mode 100644 index 0000000..7c9dc17 --- /dev/null +++ b/compat/thrust/detail/for_each.inl @@ -0,0 +1,90 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file for_each.inl + * \brief Inline file for for_each.h. 
+ */ + +#include +#include +#include +#include +#include +#include + +namespace thrust +{ + + +template + InputIterator for_each(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + UnaryFunction f) +{ + using thrust::system::detail::generic::for_each; + + return for_each(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, f); +} + + +template +InputIterator for_each(InputIterator first, + InputIterator last, + UnaryFunction f) +{ + using thrust::system::detail::generic::select_system; + typedef typename thrust::iterator_system::type System; + + System system; + return thrust::for_each(select_system(system), first, last, f); +} // end for_each() + + +template + InputIterator for_each_n(const thrust::detail::execution_policy_base &exec, + InputIterator first, + Size n, + UnaryFunction f) +{ + using thrust::system::detail::generic::for_each_n; + + return for_each_n(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, n, f); +} // end for_each_n() + + +template +InputIterator for_each_n(InputIterator first, + Size n, + UnaryFunction f) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + return thrust::for_each_n(select_system(system), first, n, f); +} // end for_each_n() + + +} // end namespace thrust + diff --git a/compat/thrust/detail/function.h b/compat/thrust/detail/function.h new file mode 100644 index 0000000..36b76c2 --- /dev/null +++ b/compat/thrust/detail/function.h @@ -0,0 +1,226 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace detail +{ + + +template + struct host_function +{ + // mutable because Function::operator() might be const + mutable Function m_f; + + inline host_function() + : m_f() + {} + + inline host_function(const Function &f) + : m_f(f) + {} + + template + inline Result operator()(Argument &x) const + { + // we static cast to Result to handle void Result without error + // in case Function's result is non-void + return static_cast(m_f(thrust::raw_reference_cast(x))); + } + + template + inline Result operator()(const Argument &x) const + { + // we static cast to Result to handle void Result without error + // in case Function's result is non-void + return static_cast(m_f(thrust::raw_reference_cast(x))); + } + + template + inline Result operator()(Argument1 &x, Argument2 &y) const + { + // we static cast to Result to handle void Result without error + // in case Function's result is non-void + return static_cast(m_f(thrust::raw_reference_cast(x), thrust::raw_reference_cast(y))); + } + + template + inline Result operator()(const Argument1 &x, Argument2 &y) const + { + // we static cast to Result to handle void Result without error + // in case Function's result is non-void + return static_cast(m_f(thrust::raw_reference_cast(x), thrust::raw_reference_cast(y))); + } + + template + inline Result operator()(const Argument1 &x, const Argument2 &y) const + { + // we static cast to Result to handle void Result without error + // in case Function's result is non-void + return static_cast(m_f(thrust::raw_reference_cast(x), thrust::raw_reference_cast(y))); + } + + template + inline Result operator()(Argument1 &x, const Argument2 &y) const + { + // we static cast to Result to handle void Result without error + // in case Function's result is non-void + return static_cast(m_f(thrust::raw_reference_cast(x), thrust::raw_reference_cast(y))); + } +}; // end host_function + + +template + struct device_function +{ + // mutable because Function::operator() might be const + mutable Function m_f; + + inline __device__ device_function() + : m_f() + {} + + inline __device__ device_function(const Function &f) + : m_f(f) + {} + + template + inline __device__ Result operator()(Argument &x) const + { + // we static cast to Result to handle void Result without error + // in case Function's result is non-void + return static_cast(m_f(thrust::raw_reference_cast(x))); + } + + template + inline __device__ Result operator()(const Argument &x) const + { + // we static cast to Result to handle void Result without error + // in case Function's result is non-void + return static_cast(m_f(thrust::raw_reference_cast(x))); + } + + template + inline __device__ Result operator()(Argument1 &x, Argument2 &y) const + { + // we static cast to Result to handle void Result without error + // in case Function's result is non-void + return static_cast(m_f(thrust::raw_reference_cast(x), thrust::raw_reference_cast(y))); + } + + template + inline __device__ Result operator()(const Argument1 &x, Argument2 &y) const + { + // we static cast to Result to handle void Result without error + // in case Function's result is non-void + return static_cast(m_f(thrust::raw_reference_cast(x), thrust::raw_reference_cast(y))); + } + + template + inline __device__ Result operator()(const Argument1 &x, const Argument2 &y) const + { + // we static cast to Result to handle void Result without error + // in case Function's result is non-void + return static_cast(m_f(thrust::raw_reference_cast(x), 
thrust::raw_reference_cast(y))); + } + + template + inline __device__ Result operator()(Argument1 &x, const Argument2 &y) const + { + // we static cast to Result to handle void Result without error + // in case Function's result is non-void + return static_cast(m_f(thrust::raw_reference_cast(x), thrust::raw_reference_cast(y))); + } +}; // end device_function + + +template + struct host_device_function +{ + // mutable because Function::operator() might be const + mutable Function m_f; + + inline __host__ __device__ + host_device_function() + : m_f() + {} + + inline __host__ __device__ + host_device_function(const Function &f) + : m_f(f) + {} + + __thrust_hd_warning_disable__ + template + inline __host__ __device__ + Result operator()(Argument &x) const + { + // we static cast to Result to handle void Result without error + // in case Function's result is non-void + return static_cast(m_f(thrust::raw_reference_cast(x))); + } + + template + inline __host__ __device__ Result operator()(const Argument &x) const + { + // we static cast to Result to handle void Result without error + // in case Function's result is non-void + return static_cast(m_f(thrust::raw_reference_cast(x))); + } + + template + inline __host__ __device__ Result operator()(Argument1 &x, Argument2 &y) const + { + // we static cast to Result to handle void Result without error + // in case Function's result is non-void + return static_cast(m_f(thrust::raw_reference_cast(x), thrust::raw_reference_cast(y))); + } + + template + inline __host__ __device__ Result operator()(const Argument1 &x, Argument2 &y) const + { + // we static cast to Result to handle void Result without error + // in case Function's result is non-void + return static_cast(m_f(thrust::raw_reference_cast(x), thrust::raw_reference_cast(y))); + } + + template + inline __host__ __device__ Result operator()(const Argument1 &x, const Argument2 &y) const + { + // we static cast to Result to handle void Result without error + // in case Function's result is non-void + return static_cast(m_f(thrust::raw_reference_cast(x), thrust::raw_reference_cast(y))); + } + + template + inline __host__ __device__ Result operator()(Argument1 &x, const Argument2 &y) const + { + // we static cast to Result to handle void Result without error + // in case Function's result is non-void + return static_cast(m_f(thrust::raw_reference_cast(x), thrust::raw_reference_cast(y))); + } +}; // end host_device_function + + +} // end detail +} // end thrust + diff --git a/compat/thrust/detail/functional.inl b/compat/thrust/detail/functional.inl new file mode 100644 index 0000000..4024585 --- /dev/null +++ b/compat/thrust/detail/functional.inl @@ -0,0 +1,122 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +namespace thrust +{ + +namespace detail +{ + +template + struct unary_traits_imp; + +template + struct unary_traits_imp +{ + typedef Operation function_type; + typedef const function_type & param_type; + typedef typename Operation::result_type result_type; + typedef typename Operation::argument_type argument_type; +}; // end unary_traits_imp + +template + struct unary_traits_imp +{ + typedef Result (*function_type)(Argument); + typedef Result (*param_type)(Argument); + typedef Result result_type; + typedef Argument argument_type; +}; // end unary_traits_imp + +template + struct binary_traits_imp; + +template + struct binary_traits_imp +{ + typedef Operation function_type; + typedef const function_type & param_type; + typedef typename Operation::result_type result_type; + typedef typename Operation::first_argument_type first_argument_type; + typedef typename Operation::second_argument_type second_argument_type; +}; // end binary_traits_imp + +template + struct binary_traits_imp +{ + typedef Result (*function_type)(Argument1, Argument2); + typedef Result (*param_type)(Argument1, Argument2); + typedef Result result_type; + typedef Argument1 first_argument_type; + typedef Argument2 second_argument_type; +}; // end binary_traits_imp + +} // end detail + +template + struct unary_traits +{ + typedef typename detail::unary_traits_imp::function_type function_type; + typedef typename detail::unary_traits_imp::param_type param_type; + typedef typename detail::unary_traits_imp::result_type result_type; + typedef typename detail::unary_traits_imp::argument_type argument_type; +}; // end unary_traits + +template + struct unary_traits +{ + typedef Result (*function_type)(Argument); + typedef Result (*param_type)(Argument); + typedef Result result_type; + typedef Argument argument_type; +}; // end unary_traits + +template + struct binary_traits +{ + typedef typename detail::binary_traits_imp::function_type function_type; + typedef typename detail::binary_traits_imp::param_type param_type; + typedef typename detail::binary_traits_imp::result_type result_type; + typedef typename detail::binary_traits_imp::first_argument_type first_argument_type; + typedef typename detail::binary_traits_imp::second_argument_type second_argument_type; +}; // end binary_traits + +template + struct binary_traits +{ + typedef Result (*function_type)(Argument1, Argument2); + typedef Result (*param_type)(Argument1, Argument2); + typedef Result result_type; + typedef Argument1 first_argument_type; + typedef Argument2 second_argument_type; +}; // end binary_traits + +template + unary_negate not1(const Predicate &pred) +{ + return unary_negate(pred); +} // end not1() + +template + binary_negate not2(const BinaryPredicate &pred) +{ + return binary_negate(pred); +} // end not2() + +} // end thrust + diff --git a/compat/thrust/detail/functional/actor.h b/compat/thrust/detail/functional/actor.h new file mode 100644 index 0000000..0b95a6b --- /dev/null +++ b/compat/thrust/detail/functional/actor.h @@ -0,0 +1,192 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Portions of this code are derived from +// +// Manjunath Kudlur's Carbon library +// +// and +// +// Based on Boost.Phoenix v1.2 +// Copyright (c) 2001-2002 Joel de Guzman + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace detail +{ +namespace functional +{ + +template + struct apply_actor +{ + typedef typename Action::template result::type type; +}; + +template + struct actor + : Eval +{ + typedef Eval eval_type; + + __host__ __device__ + actor(void); + + __host__ __device__ + actor(const Eval &base); + + __host__ __device__ + typename apply_actor::type + operator()(void) const; + + template + __host__ __device__ + typename apply_actor >::type + operator()(T0 &_0) const; + + template + __host__ __device__ + typename apply_actor >::type + operator()(T0 &_0, T1 &_1) const; + + template + __host__ __device__ + typename apply_actor >::type + operator()(T0 &_0, T1 &_1, T2 &_2) const; + + template + __host__ __device__ + typename apply_actor >::type + operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3) const; + + template + __host__ __device__ + typename apply_actor >::type + operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4) const; + + template + __host__ __device__ + typename apply_actor >::type + operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4, T5 &_5) const; + + template + __host__ __device__ + typename apply_actor >::type + operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4, T5 &_5, T6 &_6) const; + + template + __host__ __device__ + typename apply_actor >::type + operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4, T5 &_5, T6 &_6, T7 &_7) const; + + template + __host__ __device__ + typename apply_actor >::type + operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4, T5 &_5, T6 &_6, T7 &_7, T8 &_8) const; + + template + __host__ __device__ + typename apply_actor >::type + operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4, T5 &_5, T6 &_6, T7 &_7, T8 &_8, T9 &_9) const; + + template + __host__ __device__ + typename assign_result::type + operator=(const T &_1) const; +}; // end actor + +// in general, as_actor should turn things into values +template + struct as_actor +{ + typedef value type; + + static inline __host__ __device__ type convert(const T &x) + { + return val(x); + } // end convert() +}; // end as_actor + +// specialization for things which are already actors +template + struct as_actor > +{ + typedef actor type; + + static inline __host__ __device__ const type &convert(const actor &x) + { + return x; + } // end convert() +}; // end as_actor + +template + typename as_actor::type + __host__ __device__ + make_actor(const T &x) +{ + return as_actor::convert(x); +} // end make_actor() + +} // end functional + +// provide specializations for result_of for nullary, unary, and binary invocations of actor +template + struct result_of< + thrust::detail::functional::actor() + > +{ + typedef typename thrust::detail::functional::apply_actor< + thrust::detail::functional::actor, + thrust::null_type + >::type type; +}; // end result_of + +template + struct result_of< + thrust::detail::functional::actor(Arg1) + > +{ + typedef typename thrust::detail::functional::apply_actor< + thrust::detail::functional::actor, + thrust::tuple + >::type type; +}; // end result_of + +template + struct result_of< + thrust::detail::functional::actor(Arg1,Arg2) + > +{ + typedef typename thrust::detail::functional::apply_actor< + 
thrust::detail::functional::actor, + thrust::tuple + >::type type; +}; // end result_of + +} // end detail +} // end thrust + +#include + diff --git a/compat/thrust/detail/functional/actor.inl b/compat/thrust/detail/functional/actor.inl new file mode 100644 index 0000000..84347be --- /dev/null +++ b/compat/thrust/detail/functional/actor.inl @@ -0,0 +1,194 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Portions of this code are derived from +// +// Manjunath Kudlur's Carbon library +// +// and +// +// Based on Boost.Phoenix v1.2 +// Copyright (c) 2001-2002 Joel de Guzman + +#include +#include +#include +#include + +namespace thrust +{ + +namespace detail +{ +namespace functional +{ + +template + actor + ::actor(void) + : eval_type() +{} + +template + actor + ::actor(const Eval &base) + : eval_type(base) +{} + +template + typename apply_actor< + typename actor::eval_type, + typename thrust::null_type + >::type + actor + ::operator()(void) const +{ + return eval_type::eval(thrust::null_type()); +} // end basic_environment::operator() + +template + template + typename apply_actor< + typename actor::eval_type, + typename thrust::tuple + >::type + actor + ::operator()(T0 &_0) const +{ + return eval_type::eval(thrust::tie(_0)); +} // end basic_environment::operator() + +template + template + typename apply_actor< + typename actor::eval_type, + typename thrust::tuple + >::type + actor + ::operator()(T0 &_0, T1 &_1) const +{ + return eval_type::eval(thrust::tie(_0,_1)); +} // end basic_environment::operator() + +template + template + typename apply_actor< + typename actor::eval_type, + typename thrust::tuple + >::type + actor + ::operator()(T0 &_0, T1 &_1, T2 &_2) const +{ + return eval_type::eval(thrust::tie(_0,_1,_2)); +} // end basic_environment::operator() + +template + template + typename apply_actor< + typename actor::eval_type, + typename thrust::tuple + >::type + actor + ::operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3) const +{ + return eval_type::eval(thrust::tie(_0,_1,_2,_3)); +} // end basic_environment::operator() + +template + template + typename apply_actor< + typename actor::eval_type, + typename thrust::tuple + >::type + actor + ::operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4) const +{ + return eval_type::eval(thrust::tie(_0,_1,_2,_3,_4)); +} // end basic_environment::operator() + +template + template + typename apply_actor< + typename actor::eval_type, + typename thrust::tuple + >::type + actor + ::operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4, T5 &_5) const +{ + return eval_type::eval(thrust::tie(_0,_1,_2,_3,_4,_5)); +} // end basic_environment::operator() + +template + template + typename apply_actor< + typename actor::eval_type, + typename thrust::tuple + >::type + actor + ::operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4, T5 &_5, T6 &_6) const +{ + return eval_type::eval(thrust::tie(_0,_1,_2,_3,_4,_5,_6)); +} // end basic_environment::operator() + +template + template + typename apply_actor< + typename 
actor::eval_type, + typename thrust::tuple + >::type + actor + ::operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4, T5 &_5, T6 &_6, T7 &_7) const +{ + return eval_type::eval(thrust::tie(_0,_1,_2,_3,_4,_5,_6,_7)); +} // end basic_environment::operator() + +template + template + typename apply_actor< + typename actor::eval_type, + typename thrust::tuple + >::type + actor + ::operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4, T5 &_5, T6 &_6, T7 &_7, T8 &_8) const +{ + return eval_type::eval(thrust::tie(_0,_1,_2,_3,_4,_5,_6,_7,_8)); +} // end basic_environment::operator() + +template + template + typename apply_actor< + typename actor::eval_type, + typename thrust::tuple + >::type + actor + ::operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4, T5 &_5, T6 &_6, T7 &_7, T8 &_8, T9 &_9) const +{ + return eval_type::eval(thrust::tie(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9)); +} // end basic_environment::operator() + +template + template + typename assign_result::type + actor + ::operator=(const T& _1) const +{ + return do_assign(*this,_1); +} // end actor::operator=() + +} // end functional +} // end detail +} // end thrust + diff --git a/compat/thrust/detail/functional/argument.h b/compat/thrust/detail/functional/argument.h new file mode 100644 index 0000000..96a20be --- /dev/null +++ b/compat/thrust/detail/functional/argument.h @@ -0,0 +1,75 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Portions of this code are derived from +// +// Manjunath Kudlur's Carbon library +// +// and +// +// Based on Boost.Phoenix v1.2 +// Copyright (c) 2001-2002 Joel de Guzman + +#pragma once + +#include +#include + +namespace thrust +{ +namespace detail +{ +namespace functional +{ + +template + struct argument_helper +{ + typedef typename thrust::tuple_element::type type; +}; + +template + struct argument_helper +{ + typedef thrust::null_type type; +}; + + +template + class argument +{ + public: + template + struct result + : argument_helper + { + }; + + __host__ __device__ + argument(void){} + + template + __host__ __device__ + typename result::type eval(const Env &e) const + { + return thrust::get(e); + } // end eval() +}; // end argument + +} // end functional +} // end detail +} // end thrust + diff --git a/compat/thrust/detail/functional/composite.h b/compat/thrust/detail/functional/composite.h new file mode 100644 index 0000000..1d5fde3 --- /dev/null +++ b/compat/thrust/detail/functional/composite.h @@ -0,0 +1,163 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Portions of this code are derived from +// +// Manjunath Kudlur's Carbon library +// +// and +// +// Based on Boost.Phoenix v1.2 +// Copyright (c) 2001-2002 Joel de Guzman + +#pragma once + +#include +#include + +namespace thrust +{ +namespace detail +{ +namespace functional +{ + +// XXX we should just take a single EvalTuple +template + class composite; + +template + class composite< + Eval0, + Eval1, + thrust::null_type, + thrust::null_type, + thrust::null_type, + thrust::null_type, + thrust::null_type, + thrust::null_type, + thrust::null_type, + thrust::null_type + > +{ + public: + template + struct result + { + typedef typename Eval0::template result< + thrust::tuple< + typename Eval1::template result::type + > + >::type type; + }; + + __host__ __device__ + composite(const Eval0 &e0, const Eval1 &e1) + : m_eval0(e0), + m_eval1(e1) + {} + + template + __host__ __device__ + typename result::type + eval(const Env &x) const + { + typename Eval1::template result::type result1 = m_eval1.eval(x); + return m_eval0.eval(thrust::tie(result1)); + } + + private: + Eval0 m_eval0; + Eval1 m_eval1; +}; // end composite + +template + class composite< + Eval0, + Eval1, + Eval2, + thrust::null_type, + thrust::null_type, + thrust::null_type, + thrust::null_type, + thrust::null_type, + thrust::null_type, + thrust::null_type + > +{ + public: + template + struct result + { + typedef typename Eval0::template result< + thrust::tuple< + typename Eval1::template result::type, + typename Eval2::template result::type + > + >::type type; + }; + + __host__ __device__ + composite(const Eval0 &e0, const Eval1 &e1, const Eval2 &e2) + : m_eval0(e0), + m_eval1(e1), + m_eval2(e2) + {} + + template + __host__ __device__ + typename result::type + eval(const Env &x) const + { + typename Eval1::template result::type result1 = m_eval1.eval(x); + typename Eval2::template result::type result2 = m_eval2.eval(x); + return m_eval0.eval(thrust::tie(result1,result2)); + } + + private: + Eval0 m_eval0; + Eval1 m_eval1; + Eval2 m_eval2; +}; // end composite + +template +__host__ __device__ + actor > compose(const Eval0 &e0, const Eval1 &e1) +{ + return actor >(composite(e0,e1)); +} + +template +__host__ __device__ + actor > compose(const Eval0 &e0, const Eval1 &e1, const Eval2 &e2) +{ + return actor >(composite(e0,e1,e2)); +} + +} // end functional +} // end detail +} // end thrust + diff --git a/compat/thrust/detail/functional/operators.h b/compat/thrust/detail/functional/operators.h new file mode 100644 index 0000000..0fc3539 --- /dev/null +++ b/compat/thrust/detail/functional/operators.h @@ -0,0 +1,25 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include + diff --git a/compat/thrust/detail/functional/operators/arithmetic_operators.h b/compat/thrust/detail/functional/operators/arithmetic_operators.h new file mode 100644 index 0000000..a11e7ac --- /dev/null +++ b/compat/thrust/detail/functional/operators/arithmetic_operators.h @@ -0,0 +1,394 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace detail +{ +namespace functional +{ + +template +__host__ __device__ +actor< + composite< + unary_operator, + actor + > +> +__host__ __device__ +operator-(const actor &_1) +{ + return compose(unary_operator(), _1); +} // end operator-() + +// there's no standard unary_plus functional, so roll an ad hoc one here +template + struct unary_plus + : public thrust::unary_function +{ + __host__ __device__ T operator()(const T &x) const {return +x;} +}; // end unary_plus + +template +__host__ __device__ +actor< + composite< + unary_operator, + actor + > +> +operator+(const actor &_1) +{ + return compose(unary_operator(), _1); +} // end operator+() + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + typename as_actor::type + > +> +operator+(const actor &_1, const T2 &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator+() + +template +__host__ __device__ +actor< + composite< + binary_operator, + typename as_actor::type, + actor + > +> +operator+(const T1 &_1, const actor &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator+() + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + actor + > +> +operator+(const actor &_1, const actor &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator+() + +template +__host__ __device__ +actor< + composite< + binary_operator, + typename as_actor::type, + actor + > +> +operator-(const T1 &_1, const actor &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator-() + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + typename as_actor::type + > +> +operator-(const actor &_1, const T2 &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator-() + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + actor + > +> +operator-(const actor &_1, const actor &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator-() + +template +__host__ __device__ +actor< + composite< + binary_operator, + typename as_actor::type, + actor + > +> +operator*(const T1 &_1, const actor &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator*() + +template +__host__ __device__ +actor< + composite< + 
binary_operator, + actor, + typename as_actor::type + > +> +operator*(const actor &_1, const T2 &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator*() + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + actor + > +> +operator*(const actor &_1, const actor &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator*() + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + typename as_actor::type + > +> +operator/(const actor &_1, const T2 &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator/() + +template +__host__ __device__ +actor< + composite< + binary_operator, + typename as_actor::type, + actor + > +> +operator/(const T1 &_1, const actor &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator/() + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + actor + > +> +operator/(const actor &_1, const actor &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator/() + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + typename as_actor::type + > +> +operator%(const actor &_1, const T2 &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator%() + +template +__host__ __device__ +actor< + composite< + binary_operator, + typename as_actor::type, + actor + > +> +operator%(const T1 &_1, const actor &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator%() + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + actor + > +> +operator%(const actor &_1, const actor &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator%() + +// there's no standard prefix_increment functional, so roll an ad hoc one here +template + struct prefix_increment + : public thrust::unary_function +{ + __host__ __device__ T& operator()(T &x) const { return ++x; } +}; // end prefix_increment + +template +__host__ __device__ +actor< + composite< + unary_operator, + actor + > +> +operator++(const actor &_1) +{ + return compose(unary_operator(), _1); +} // end operator++() + +// there's no standard suffix_increment functional, so roll an ad hoc one here +template + struct suffix_increment + : public thrust::unary_function +{ + __host__ __device__ T operator()(T &x) const { return x++; } +}; // end suffix_increment + +template +__host__ __device__ +actor< + composite< + unary_operator, + actor + > +> +operator++(const actor &_1, int) +{ + return compose(unary_operator(), _1); +} // end operator++() + +// there's no standard prefix_decrement functional, so roll an ad hoc one here +template + struct prefix_decrement + : public thrust::unary_function +{ + __host__ __device__ T& operator()(T &x) const { return --x; } +}; // end prefix_decrement + +template +__host__ __device__ +actor< + composite< + unary_operator, + actor + > +> +operator--(const actor &_1) +{ + return compose(unary_operator(), _1); +} // end operator--() + +// there's no standard suffix_decrement functional, so roll an ad hoc one here +template + struct suffix_decrement + : public thrust::unary_function +{ + __host__ __device__ T operator()(T &x) const { return x--; } +}; // end suffix_decrement + +template +__host__ __device__ +actor< + composite< + unary_operator, + 
actor + > +> +operator--(const actor &_1, int) +{ + return compose(unary_operator(), _1); +} // end operator--() + +} // end functional +} // end detail +} // end thrust + diff --git a/compat/thrust/detail/functional/operators/assignment_operator.h b/compat/thrust/detail/functional/operators/assignment_operator.h new file mode 100644 index 0000000..e5d6620 --- /dev/null +++ b/compat/thrust/detail/functional/operators/assignment_operator.h @@ -0,0 +1,72 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace thrust +{ + +// XXX WAR circular inclusion with this forward declaration +template struct binary_function; + +namespace detail +{ +namespace functional +{ + +// XXX WAR circular inclusion with this forward declaration +template struct as_actor; + +// there's no standard assign functional, so roll an ad hoc one here +template + struct assign + : thrust::binary_function +{ + __host__ __device__ T& operator()(T &lhs, const T &rhs) const { return lhs = rhs; } +}; // end assign + +template + struct assign_result +{ + typedef actor< + composite< + binary_operator, + actor, + typename as_actor::type + > + > type; +}; // end assign_result + +template + __host__ __device__ + typename assign_result::type + do_assign(const actor &_1, const T &_2) +{ + return compose(binary_operator(), + _1, + as_actor::convert(_2)); +} // end do_assign() + +} // end functional +} // end detail +} // end thrust + diff --git a/compat/thrust/detail/functional/operators/bitwise_operators.h b/compat/thrust/detail/functional/operators/bitwise_operators.h new file mode 100644 index 0000000..c89c5d4 --- /dev/null +++ b/compat/thrust/detail/functional/operators/bitwise_operators.h @@ -0,0 +1,313 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace detail +{ +namespace functional +{ + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + typename as_actor::type + > +> +operator&(const actor &_1, const T2 &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator&() + +template +__host__ __device__ +actor< + composite< + binary_operator, + typename as_actor::type, + actor + > +> +operator&(const T1 &_1, const actor &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator&() + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + actor + > +> +operator&(const actor &_1, const actor &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator&() + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + typename as_actor::type + > +> +operator|(const actor &_1, const T2 &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator|() + +template +__host__ __device__ +actor< + composite< + binary_operator, + typename as_actor::type, + actor + > +> +operator|(const T1 &_1, const actor &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator|() + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + actor + > +> +operator|(const actor &_1, const actor &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator|() + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + typename as_actor::type + > +> +operator^(const actor &_1, const T2 &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator^() + +template +__host__ __device__ +actor< + composite< + binary_operator, + typename as_actor::type, + actor + > +> +operator^(const T1 &_1, const actor &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator^() + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + actor + > +> +operator^(const actor &_1, const actor &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator^() + +// there's no standard bit_not functional, so roll an ad hoc one here +template + struct bit_not + : public thrust::unary_function +{ + __host__ __device__ T operator()(const T &x) const {return ~x;} +}; // end bit_not + +template +__host__ __device__ +actor< + composite< + unary_operator, + actor + > +> +__host__ __device__ +operator~(const actor &_1) +{ + return compose(unary_operator(), _1); +} // end operator~() + +// there's no standard bit_lshift functional, so roll an ad hoc one here +template + struct bit_lshift + : public thrust::binary_function +{ + __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs << rhs;} +}; // end bit_lshift + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + typename as_actor::type + > +> +operator<<(const actor &_1, const T2 &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator<<() + +template +__host__ __device__ +actor< + composite< + binary_operator, + typename as_actor::type, + actor + > +> +operator<<(const T1 &_1, const actor &_2) +{ + return compose(binary_operator(), + make_actor(_1), + 
make_actor(_2)); +} // end operator<<() + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + actor + > +> +operator<<(const actor &_1, const actor &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator<<() + +// there's no standard bit_rshift functional, so roll an ad hoc one here +template + struct bit_rshift + : public thrust::binary_function +{ + __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs >> rhs;} +}; // end bit_rshift + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + typename as_actor::type + > +> +operator>>(const actor &_1, const T2 &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator>>() + +template +__host__ __device__ +actor< + composite< + binary_operator, + typename as_actor::type, + actor + > +> +operator>>(const T1 &_1, const actor &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator>>() + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + actor + > +> +operator>>(const actor &_1, const actor &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator>>() + +} // end functional +} // end detail +} // end thrust + diff --git a/compat/thrust/detail/functional/operators/compound_assignment_operators.h b/compat/thrust/detail/functional/operators/compound_assignment_operators.h new file mode 100644 index 0000000..ef7389b --- /dev/null +++ b/compat/thrust/detail/functional/operators/compound_assignment_operators.h @@ -0,0 +1,424 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ +namespace detail +{ +namespace functional +{ + +template + struct plus_equal + : public thrust::binary_function +{ + __host__ __device__ T& operator()(T &lhs, const T &rhs) const { return lhs += rhs; } +}; // end plus_equal + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + typename as_actor::type + > +> +operator+=(const actor &_1, const T2 &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator+=() + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + actor + > +> +operator+=(const actor &_1, const actor &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator+=() + +template + struct minus_equal + : public thrust::binary_function +{ + __host__ __device__ T& operator()(T &lhs, const T &rhs) const { return lhs -= rhs; } +}; // end minus_equal + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + typename as_actor::type + > +> +operator-=(const actor &_1, const T2 &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator-=() + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + actor + > +> +operator-=(const actor &_1, const actor &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator-=() + +template + struct multiplies_equal + : public thrust::binary_function +{ + __host__ __device__ T& operator()(T &lhs, const T&rhs) const { return lhs *= rhs; } +}; // end multiplies_equal + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + typename as_actor::type + > +> +operator*=(const actor &_1, const T2 &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator*=() + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + actor + > +> +operator*=(const actor &_1, const actor &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator*=() + +template + struct divides_equal + : public thrust::binary_function +{ + __host__ __device__ T& operator()(T &lhs, const T&rhs) const { return lhs /= rhs; } +}; // end divides_equal + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + typename as_actor::type + > +> +operator/=(const actor &_1, const T2 &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator/=() + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + actor + > +> +operator/=(const actor &_1, const actor &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator/=() + +template + struct modulus_equal + : public thrust::binary_function +{ + __host__ __device__ T& operator()(T &lhs, const T&rhs) const { return lhs %= rhs; } +}; // end modulus_equal + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + typename as_actor::type + > +> +operator%=(const actor &_1, const T2 &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator%=() + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + actor + > +> +operator%=(const actor &_1, const actor &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator%=() + 
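Taken together, these compound-assignment overloads let placeholder expressions such as _1 += x build composite functors that mutate the element the placeholder is bound to. A short sketch of how such an expression is typically consumed (the vector contents are made up for illustration; any Thrust installation providing thrust::placeholders is assumed):

#include <thrust/host_vector.h>
#include <thrust/for_each.h>
#include <thrust/functional.h>

int main()
{
  using thrust::placeholders::_1;

  thrust::host_vector<int> v(3);
  v[0] = 1; v[1] = 2; v[2] = 3;

  // _1 *= 10 expands (via the overloads above) into an actor whose eval()
  // multiplies each dereferenced element in place by 10
  thrust::for_each(v.begin(), v.end(), _1 *= 10);   // v becomes {10, 20, 30}
  return 0;
}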
+template + struct bit_and_equal + : public thrust::binary_function +{ + __host__ __device__ T& operator()(T &lhs, const T&rhs) const { return lhs &= rhs; } +}; // end bit_and_equal + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + typename as_actor::type + > +> +operator&=(const actor &_1, const T2 &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator&=() + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + actor + > +> +operator&=(const actor &_1, const actor &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator&=() + +template + struct bit_or_equal + : public thrust::binary_function +{ + __host__ __device__ T& operator()(T &lhs, const T&rhs) const { return lhs |= rhs; } +}; // end bit_or_equal + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + typename as_actor::type + > +> +operator|=(const actor &_1, const T2 &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator|=() + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + actor + > +> +operator|=(const actor &_1, const actor &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator|=() + +template + struct bit_xor_equal + : public thrust::binary_function +{ + __host__ __device__ T& operator()(T &lhs, const T&rhs) const { return lhs ^= rhs; } +}; // end bit_xor_equal + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + typename as_actor::type + > +> +operator^=(const actor &_1, const T2 &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator|=() + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + actor + > +> +operator^=(const actor &_1, const actor &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator|=() + +template + struct bit_lshift_equal + : public thrust::binary_function +{ + __host__ __device__ T& operator()(T &lhs, const T&rhs) const { return lhs <<= rhs; } +}; // end bit_lshift_equal + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + typename as_actor::type + > +> +operator<<=(const actor &_1, const T2 &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator<<=() + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + actor + > +> +operator<<=(const actor &_1, const actor &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator<<=() + +template + struct bit_rshift_equal + : public thrust::binary_function +{ + __host__ __device__ T& operator()(T &lhs, const T&rhs) const { return lhs >>= rhs; } +}; // end bit_rshift_equal + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + typename as_actor::type + > +> +operator>>=(const actor &_1, const T2 &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator>>=() + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + actor + > +> +operator>>=(const actor &_1, const actor &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator>>=() + +} // end functional +} // end detail +} // end thrust + diff --git 
a/compat/thrust/detail/functional/operators/logical_operators.h b/compat/thrust/detail/functional/operators/logical_operators.h new file mode 100644 index 0000000..9c95262 --- /dev/null +++ b/compat/thrust/detail/functional/operators/logical_operators.h @@ -0,0 +1,144 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace detail +{ +namespace functional +{ + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + typename as_actor::type + > +> +operator&&(const actor &_1, const T2 &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator&&() + +template +__host__ __device__ +actor< + composite< + binary_operator, + typename as_actor::type, + actor + > +> +operator&&(const T1 &_1, const actor &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator&&() + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + actor + > +> +operator&&(const actor &_1, const actor &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator&&() + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + typename as_actor::type + > +> +operator||(const actor &_1, const T2 &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator&&() + +template +__host__ __device__ +actor< + composite< + binary_operator, + typename as_actor::type, + actor + > +> +operator||(const T1 &_1, const actor &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator&&() + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + actor + > +> +operator||(const actor &_1, const actor &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator&&() + +template +__host__ __device__ +actor< + composite< + unary_operator, + actor + > +> +operator!(const actor &_1) +{ + return compose(unary_operator(), _1); +} // end operator!() + +} // end functional +} // end detail +} // end thrust + diff --git a/compat/thrust/detail/functional/operators/operator_adaptors.h b/compat/thrust/detail/functional/operators/operator_adaptors.h new file mode 100644 index 0000000..d35fe97 --- /dev/null +++ b/compat/thrust/detail/functional/operators/operator_adaptors.h @@ -0,0 +1,115 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace detail +{ +namespace functional +{ + +// this thing (which models Eval) is an adaptor for the unary +// functors inside functional.h +template class UnaryOperator> + struct unary_operator +{ + template + struct argument + : thrust::detail::eval_if< + (thrust::tuple_size::value == 0), + thrust::detail::identity_, + thrust::tuple_element<0,Env> + > + { + }; + + template + struct operator_type + { + typedef UnaryOperator< + typename thrust::detail::remove_reference< + typename argument::type + >::type + > type; + }; + + template + struct result + { + typedef typename operator_type::type op_type; + typedef typename op_type::result_type type; + }; + + template + __host__ __device__ + typename result::type eval(const Env &e) const + { + typename operator_type::type op; + return op(thrust::get<0>(e)); + } // end eval() +}; // end unary_operator + +// this thing (which models Eval) is an adaptor for the binary +// functors inside functional.h +template class BinaryOperator> + struct binary_operator +{ + template + struct first_argument + : thrust::detail::eval_if< + (thrust::tuple_size::value == 0), + thrust::detail::identity_, + thrust::tuple_element<0,Env> + > + { + }; + + template + struct operator_type + { + typedef BinaryOperator< + typename thrust::detail::remove_reference< + typename first_argument::type + >::type + > type; + }; + + template + struct result + { + typedef typename operator_type::type op_type; + typedef typename op_type::result_type type; + }; + + template + __host__ __device__ + typename result::type eval(const Env &e) const + { + typename operator_type::type op; + return op(thrust::get<0>(e), thrust::get<1>(e)); + } // end eval() +}; // end binary_operator + +} // end functional +} // end detail +} // end thrust + diff --git a/compat/thrust/detail/functional/operators/relational_operators.h b/compat/thrust/detail/functional/operators/relational_operators.h new file mode 100644 index 0000000..6b26534 --- /dev/null +++ b/compat/thrust/detail/functional/operators/relational_operators.h @@ -0,0 +1,323 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace detail +{ +namespace functional +{ + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + typename as_actor::type + > +> +operator==(const actor &_1, const T2 &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator==() + +template +__host__ __device__ +actor< + composite< + binary_operator, + typename as_actor::type, + actor + > +> +operator==(const T1 &_1, const actor &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator==() + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + actor + > +> +operator==(const actor &_1, const actor &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator==() + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + typename as_actor::type + > +> +operator!=(const actor &_1, const T2 &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator!=() + +template +__host__ __device__ +actor< + composite< + binary_operator, + typename as_actor::type, + actor + > +> +operator!=(const T1 &_1, const actor &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator!=() + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + actor + > +> +operator!=(const actor &_1, const actor &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator!=() + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + typename as_actor::type + > +> +operator>(const actor &_1, const T2 &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator>() + +template +__host__ __device__ +actor< + composite< + binary_operator, + typename as_actor::type, + actor + > +> +operator>(const T1 &_1, const actor &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator>() + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + actor + > +> +operator>(const actor &_1, const actor &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator>() + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + typename as_actor::type + > +> +operator<(const actor &_1, const T2 &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator<() + +template +__host__ __device__ +actor< + composite< + binary_operator, + typename as_actor::type, + actor + > +> +operator<(const T1 &_1, const actor &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator<() + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + actor + > +> +operator<(const actor &_1, const actor &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator<() + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + typename as_actor::type + > +> +operator>=(const actor &_1, const T2 &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator>=() + +template +__host__ __device__ +actor< + composite< + binary_operator, + typename as_actor::type, + actor + > +> +operator>=(const T1 &_1, 
const actor &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator>=() + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + actor + > +> +operator>=(const actor &_1, const actor &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator>=() + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + typename as_actor::type + > +> +operator<=(const actor &_1, const T2 &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator<=() + +template +__host__ __device__ +actor< + composite< + binary_operator, + typename as_actor::type, + actor + > +> +operator<=(const T1 &_1, const actor &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator<=() + +template +__host__ __device__ +actor< + composite< + binary_operator, + actor, + actor + > +> +operator<=(const actor &_1, const actor &_2) +{ + return compose(binary_operator(), + make_actor(_1), + make_actor(_2)); +} // end operator<=() + +} // end functional +} // end detail +} // end thrust + diff --git a/compat/thrust/detail/functional/placeholder.h b/compat/thrust/detail/functional/placeholder.h new file mode 100644 index 0000000..9acf6da --- /dev/null +++ b/compat/thrust/detail/functional/placeholder.h @@ -0,0 +1,39 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace detail +{ +namespace functional +{ + +template + struct placeholder +{ + typedef actor > type; +}; + +} // end functional +} // end detail +} // end thrust + diff --git a/compat/thrust/detail/functional/value.h b/compat/thrust/detail/functional/value.h new file mode 100644 index 0000000..27e2802 --- /dev/null +++ b/compat/thrust/detail/functional/value.h @@ -0,0 +1,80 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Portions of this code are derived from +// +// Manjunath Kudlur's Carbon library +// +// and +// +// Based on Boost.Phoenix v1.2 +// Copyright (c) 2001-2002 Joel de Guzman + +#pragma once + +#include +#include + +namespace thrust +{ +namespace detail +{ +namespace functional +{ + + +template struct actor; + + +template + class value +{ + public: + + template + struct result + { + typedef T type; + }; + + __host__ __device__ + value(const T &arg) + : m_val(arg) + {} + + template + __host__ __device__ + T eval(const Env &) const + { + return m_val; + } + + private: + T m_val; +}; // end value + +template +__host__ __device__ +actor > val(const T &x) +{ + return value(x); +} // end val() + + +} // end functional +} // end detail +} // end thrust + diff --git a/compat/thrust/detail/gather.inl b/compat/thrust/detail/gather.inl new file mode 100644 index 0000000..4edecd0 --- /dev/null +++ b/compat/thrust/detail/gather.inl @@ -0,0 +1,160 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file gather.inl + * \brief Inline file for gather.h. + */ + +#include +#include +#include +#include +#include + +namespace thrust +{ + + +template + OutputIterator gather(const thrust::detail::execution_policy_base &exec, + InputIterator map_first, + InputIterator map_last, + RandomAccessIterator input_first, + OutputIterator result) +{ + using thrust::system::detail::generic::gather; + return gather(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), map_first, map_last, input_first, result); +} // end gather() + + +template + OutputIterator gather_if(const thrust::detail::execution_policy_base &exec, + InputIterator1 map_first, + InputIterator1 map_last, + InputIterator2 stencil, + RandomAccessIterator input_first, + OutputIterator result) +{ + using thrust::system::detail::generic::gather_if; + return gather_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), map_first, map_last, stencil, input_first, result); +} // end gather_if() + + +template + OutputIterator gather_if(const thrust::detail::execution_policy_base &exec, + InputIterator1 map_first, + InputIterator1 map_last, + InputIterator2 stencil, + RandomAccessIterator input_first, + OutputIterator result, + Predicate pred) +{ + using thrust::system::detail::generic::gather_if; + return gather_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), map_first, map_last, stencil, input_first, result, pred); +} // end gather_if() + + +template + OutputIterator gather(InputIterator map_first, + InputIterator map_last, + RandomAccessIterator input_first, + OutputIterator result) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + + System1 system1; + System2 system2; + System3 system3; + + return thrust::gather(select_system(system1,system2,system3), 
map_first, map_last, input_first, result); +} // end gather() + + +template + OutputIterator gather_if(InputIterator1 map_first, + InputIterator1 map_last, + InputIterator2 stencil, + RandomAccessIterator input_first, + OutputIterator result) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + typedef typename thrust::iterator_system::type System4; + + System1 system1; + System2 system2; + System3 system3; + System4 system4; + + return thrust::gather_if(select_system(system1,system2,system3,system4), map_first, map_last, stencil, input_first, result); +} // end gather_if() + + +template + OutputIterator gather_if(InputIterator1 map_first, + InputIterator1 map_last, + InputIterator2 stencil, + RandomAccessIterator input_first, + OutputIterator result, + Predicate pred) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + typedef typename thrust::iterator_system::type System4; + + System1 system1; + System2 system2; + System3 system3; + System4 system4; + + return thrust::gather_if(select_system(system1,system2,system3,system4), map_first, map_last, stencil, input_first, result, pred); +} // end gather_if() + + +} // end namespace thrust + diff --git a/compat/thrust/detail/generate.inl b/compat/thrust/detail/generate.inl new file mode 100644 index 0000000..c125804 --- /dev/null +++ b/compat/thrust/detail/generate.inl @@ -0,0 +1,94 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file generate.inl + * \author Jared Hoberock + * \brief Inline file for generate.h. 
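+ *
+ * For illustration, a minimal sketch of a typical call (assuming a
+ * thrust::host_vector<int> h_vec that has already been sized, and <cstdlib>
+ * for rand):
+ *
+ * \code
+ * thrust::generate(h_vec.begin(), h_vec.end(), rand); // fill with pseudo-random values
+ * \endcode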
+ */ + +#include +#include +#include +#include +#include + +namespace thrust +{ + + +template + void generate(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + Generator gen) +{ + using thrust::system::detail::generic::generate; + return generate(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, gen); +} // end generate() + + +template + OutputIterator generate_n(const thrust::detail::execution_policy_base &exec, + OutputIterator first, + Size n, + Generator gen) +{ + using thrust::system::detail::generic::generate_n; + return generate_n(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, n, gen); +} // end generate_n() + + +template + void generate(ForwardIterator first, + ForwardIterator last, + Generator gen) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::generate(select_system(system), first, last, gen); +} // end generate() + + +template + OutputIterator generate_n(OutputIterator first, + Size n, + Generator gen) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::generate_n(select_system(system), first, n, gen); +} // end generate_n() + + +} // end thrust + diff --git a/compat/thrust/detail/host_vector.inl b/compat/thrust/detail/host_vector.inl new file mode 100644 index 0000000..e5c60ab --- /dev/null +++ b/compat/thrust/detail/host_vector.inl @@ -0,0 +1,37 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file host_vector.inl + * \brief Inline file for host_vector.h. + */ + +#include + +namespace thrust +{ + +template + template + host_vector + ::host_vector(const device_vector &v) + :Parent(v) +{ + ; +} // end host_vector::host_vector() + +} // end namespace thrust + diff --git a/compat/thrust/detail/inner_product.inl b/compat/thrust/detail/inner_product.inl new file mode 100644 index 0000000..f7773d8 --- /dev/null +++ b/compat/thrust/detail/inner_product.inl @@ -0,0 +1,104 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file inner_product.inl + * \brief Inline file for inner_product.h. 
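+ *
+ * For illustration, a minimal sketch (assuming two thrust::device_vector<float>
+ * d_a and d_b of equal length):
+ *
+ * \code
+ * // dot product of d_a and d_b, accumulated from an initial value of 0.0f
+ * float dot = thrust::inner_product(d_a.begin(), d_a.end(), d_b.begin(), 0.0f);
+ * \endcode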
+ */ + +#include +#include +#include +#include +#include +#include + +namespace thrust +{ + + +template +OutputType inner_product(const thrust::detail::execution_policy_base &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + OutputType init) +{ + using thrust::system::detail::generic::inner_product; + return inner_product(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, init); +} // end inner_product() + + +template +OutputType inner_product(const thrust::detail::execution_policy_base &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + OutputType init, + BinaryFunction1 binary_op1, + BinaryFunction2 binary_op2) +{ + using thrust::system::detail::generic::inner_product; + return inner_product(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, init, binary_op1, binary_op2); +} // end inner_product() + + +template +OutputType +inner_product(InputIterator1 first1, InputIterator1 last1, + InputIterator2 first2, OutputType init) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + + System1 system1; + System2 system2; + + return thrust::inner_product(select_system(system1,system2), first1, last1, first2, init); +} // end inner_product() + + +template +OutputType +inner_product(InputIterator1 first1, InputIterator1 last1, + InputIterator2 first2, OutputType init, + BinaryFunction1 binary_op1, BinaryFunction2 binary_op2) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + + System1 system1; + System2 system2; + + return thrust::inner_product(select_system(system1,system2), first1, last1, first2, init, binary_op1, binary_op2); +} // end inner_product() + + +} // end namespace thrust + diff --git a/compat/thrust/detail/integer_traits.h b/compat/thrust/detail/integer_traits.h new file mode 100644 index 0000000..e4cf5d1 --- /dev/null +++ b/compat/thrust/detail/integer_traits.h @@ -0,0 +1,132 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ + +namespace detail +{ + +template + class integer_traits +{ + public: + static const bool is_integral = false; +}; + +template + class integer_traits_base +{ + public: + static const bool is_integral = true; + static const T const_min = min_val; + static const T const_max = max_val; +}; + + +template<> + class integer_traits + : public std::numeric_limits, + public integer_traits_base +{}; + + +template<> + class integer_traits + : public std::numeric_limits, + public integer_traits_base +{}; + + +template<> + class integer_traits + : public std::numeric_limits, + public integer_traits_base +{}; + + +template<> + class integer_traits + : public std::numeric_limits, + public integer_traits_base +{}; + + +template<> + class integer_traits + : public std::numeric_limits, + public integer_traits_base +{}; + + +template<> + class integer_traits + : public std::numeric_limits, + public integer_traits_base +{}; + + +template<> + class integer_traits + : public std::numeric_limits, + public integer_traits_base +{}; + + +template<> + class integer_traits + : public std::numeric_limits, + public integer_traits_base +{}; + + +template<> + class integer_traits + : public std::numeric_limits, + public integer_traits_base +{}; + + +template<> + class integer_traits + : public std::numeric_limits, + public integer_traits_base +{}; + + +template<> + class integer_traits + : public std::numeric_limits, + public integer_traits_base +{}; + + +template<> + class integer_traits + : public std::numeric_limits, + public integer_traits_base +{}; + +} // end detail + +} // end thrust + diff --git a/compat/thrust/detail/internal_functional.h b/compat/thrust/detail/internal_functional.h new file mode 100644 index 0000000..6d5264a --- /dev/null +++ b/compat/thrust/detail/internal_functional.h @@ -0,0 +1,678 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file internal_functional.inl + * \brief Non-public functionals used to implement algorithm internals. 
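+ *
+ * For illustration, a minimal sketch of the argument-type-agnostic negators
+ * defined below (Predicate and pred are assumed names for some unary
+ * predicate type and object):
+ *
+ * \code
+ * // not_pred(x) evaluates !bool(pred(x)); unlike std::not1, no
+ * // Predicate::argument_type typedef is required
+ * thrust::detail::unary_negate<Predicate> not_pred = thrust::detail::not1(pred);
+ * \endcode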
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include // for ::new + +namespace thrust +{ +namespace detail +{ + +// unary_negate does not need to know argument_type +template +struct unary_negate +{ + typedef bool result_type; + + Predicate pred; + + __host__ __device__ + explicit unary_negate(const Predicate& pred) : pred(pred) {} + + template + __host__ __device__ + bool operator()(const T& x) + { + return !bool(pred(x)); + } +}; + +// binary_negate does not need to know first_argument_type or second_argument_type +template +struct binary_negate +{ + typedef bool result_type; + + Predicate pred; + + __host__ __device__ + explicit binary_negate(const Predicate& pred) : pred(pred) {} + + template + __host__ __device__ + bool operator()(const T1& x, const T2& y) + { + return !bool(pred(x,y)); + } +}; + +template + __host__ __device__ + thrust::detail::unary_negate not1(const Predicate &pred) +{ + return thrust::detail::unary_negate(pred); +} + +template + __host__ __device__ + thrust::detail::binary_negate not2(const Predicate &pred) +{ + return thrust::detail::binary_negate(pred); +} + + +// convert a predicate to a 0 or 1 integral value +template +struct predicate_to_integral +{ + Predicate pred; + + __host__ __device__ + explicit predicate_to_integral(const Predicate& pred) : pred(pred) {} + + template + __host__ __device__ + bool operator()(const T& x) + { + return pred(x) ? IntegralType(1) : IntegralType(0); + } +}; + + +// note that detail::equal_to does not force conversion from T2 -> T1 as equal_to does +template +struct equal_to +{ + typedef bool result_type; + + template + __host__ __device__ + bool operator()(const T1& lhs, const T2& rhs) const + { + return lhs == rhs; + } +}; + +// note that equal_to_value does not force conversion from T2 -> T1 as equal_to does +template +struct equal_to_value +{ + T2 rhs; + + equal_to_value(const T2& rhs) : rhs(rhs) {} + + template + __host__ __device__ + bool operator()(const T1& lhs) const + { + return lhs == rhs; + } +}; + +template +struct tuple_binary_predicate +{ + typedef bool result_type; + + __host__ __device__ + tuple_binary_predicate(const Predicate& p) : pred(p) {} + + template + __host__ __device__ + bool operator()(const Tuple& t) const + { + return pred(thrust::get<0>(t), thrust::get<1>(t)); + } + + Predicate pred; +}; + +template +struct tuple_not_binary_predicate +{ + typedef bool result_type; + + __host__ __device__ + tuple_not_binary_predicate(const Predicate& p) : pred(p) {} + + template + __host__ __device__ + bool operator()(const Tuple& t) const + { + return !pred(thrust::get<0>(t), thrust::get<1>(t)); + } + + Predicate pred; +}; + +template + struct host_generate_functor +{ + typedef void result_type; + + __host__ __device__ + host_generate_functor(Generator g) + : gen(g) {} + + // operator() does not take an lvalue reference because some iterators + // produce temporary proxy references when dereferenced. for example, + // consider the temporary tuple of references produced by zip_iterator. + // such temporaries cannot bind to an lvalue reference. + // + // to WAR this, accept a const reference (which is bindable to a temporary), + // and const_cast in the implementation. 
+ // + // XXX change to an rvalue reference upon c++0x (which either a named variable + // or temporary can bind to) + template + __host__ + void operator()(const T &x) + { + // we have to be naughty and const_cast this to get it to work + T &lvalue = const_cast(x); + + // this assigns correctly whether x is a true reference or proxy + lvalue = gen(); + } + + Generator gen; +}; + +template + struct device_generate_functor +{ + typedef void result_type; + + __host__ __device__ + device_generate_functor(Generator g) + : gen(g) {} + + // operator() does not take an lvalue reference because some iterators + // produce temporary proxy references when dereferenced. for example, + // consider the temporary tuple of references produced by zip_iterator. + // such temporaries cannot bind to an lvalue reference. + // + // to WAR this, accept a const reference (which is bindable to a temporary), + // and const_cast in the implementation. + // + // XXX change to an rvalue reference upon c++0x (which either a named variable + // or temporary can bind to) + template + __host__ __device__ + void operator()(const T &x) + { + // we have to be naughty and const_cast this to get it to work + T &lvalue = const_cast(x); + + // this assigns correctly whether x is a true reference or proxy + lvalue = gen(); + } + + Generator gen; +}; + +template + struct generate_functor + : thrust::detail::eval_if< + thrust::detail::is_convertible::value, + thrust::detail::identity_ >, + thrust::detail::identity_ > + > +{}; + + +template + struct zipped_binary_op +{ + typedef ResultType result_type; + + __host__ __device__ + zipped_binary_op(BinaryFunction binary_op) + : m_binary_op(binary_op) {} + + template + __host__ __device__ + inline result_type operator()(Tuple t) + { + return m_binary_op(thrust::get<0>(t), thrust::get<1>(t)); + } + + BinaryFunction m_binary_op; +}; + + +template + struct is_non_const_reference + : thrust::detail::and_< + thrust::detail::not_ >, + thrust::detail::is_reference + > +{}; + +template struct is_tuple_of_iterator_references : thrust::detail::false_type {}; + +template + struct is_tuple_of_iterator_references< + thrust::detail::tuple_of_iterator_references< + T1,T2,T3,T4,T5,T6,T7,T8,T9,T10 + > + > + : thrust::detail::true_type +{}; + +// use this enable_if to avoid assigning to temporaries in the transform functors below +// XXX revisit this problem with c++11 perfect forwarding +template + struct enable_if_non_const_reference_or_tuple_of_iterator_references + : thrust::detail::enable_if< + is_non_const_reference::value || is_tuple_of_iterator_references::value + > +{}; + + +template + struct host_unary_transform_functor +{ + typedef void result_type; + + UnaryFunction f; + + host_unary_transform_functor(UnaryFunction f_) + :f(f_) {} + + template + inline __host__ + typename enable_if_non_const_reference_or_tuple_of_iterator_references< + typename thrust::tuple_element<1,Tuple>::type + >::type + operator()(Tuple t) + { + thrust::get<1>(t) = f(thrust::get<0>(t)); + } +}; + +template + struct device_unary_transform_functor +{ + typedef void result_type; + + UnaryFunction f; + + device_unary_transform_functor(UnaryFunction f_) + :f(f_) {} + + // add __host__ to allow the omp backend compile with nvcc + template + inline __host__ __device__ + typename enable_if_non_const_reference_or_tuple_of_iterator_references< + typename thrust::tuple_element<1,Tuple>::type + >::type + operator()(Tuple t) + { + thrust::get<1>(t) = f(thrust::get<0>(t)); + } +}; + + +template + struct unary_transform_functor + : 
thrust::detail::eval_if< + thrust::detail::is_convertible::value, + thrust::detail::identity_ >, + thrust::detail::identity_ > + > +{}; + + +template + struct host_binary_transform_functor +{ + BinaryFunction f; + + host_binary_transform_functor(BinaryFunction f_) + :f(f_) + {} + + template + __host__ + void operator()(Tuple t) + { + thrust::get<2>(t) = f(thrust::get<0>(t), thrust::get<1>(t)); + } +}; // end binary_transform_functor + + +template + struct device_binary_transform_functor +{ + BinaryFunction f; + + device_binary_transform_functor(BinaryFunction f_) + :f(f_) + {} + + // add __host__ to allow the omp backend compile with nvcc + template + inline __host__ __device__ + typename enable_if_non_const_reference_or_tuple_of_iterator_references< + typename thrust::tuple_element<2,Tuple>::type + >::type + operator()(Tuple t) + { + thrust::get<2>(t) = f(thrust::get<0>(t), thrust::get<1>(t)); + } +}; // end binary_transform_functor + + +template + struct binary_transform_functor + : thrust::detail::eval_if< + thrust::detail::is_convertible::value, + thrust::detail::identity_ >, + thrust::detail::identity_ > + > +{}; + + +template +struct host_unary_transform_if_functor +{ + UnaryFunction unary_op; + Predicate pred; + + host_unary_transform_if_functor(UnaryFunction unary_op_, Predicate pred_) + : unary_op(unary_op_), pred(pred_) {} + + template + inline __host__ + typename enable_if_non_const_reference_or_tuple_of_iterator_references< + typename thrust::tuple_element<1,Tuple>::type + >::type + operator()(Tuple t) + { + if(pred(thrust::get<0>(t))) + { + thrust::get<1>(t) = unary_op(thrust::get<0>(t)); + } + } +}; // end host_unary_transform_if_functor + + +template +struct device_unary_transform_if_functor +{ + UnaryFunction unary_op; + Predicate pred; + + device_unary_transform_if_functor(UnaryFunction unary_op_, Predicate pred_) + : unary_op(unary_op_), pred(pred_) {} + + template + inline __host__ __device__ + typename enable_if_non_const_reference_or_tuple_of_iterator_references< + typename thrust::tuple_element<1,Tuple>::type + >::type + operator()(Tuple t) + { + if(pred(thrust::get<0>(t))) + { + thrust::get<1>(t) = unary_op(thrust::get<0>(t)); + } + } +}; // end device_unary_transform_if_functor + + +template + struct unary_transform_if_functor + : thrust::detail::eval_if< + thrust::detail::is_convertible::value, + thrust::detail::identity_ >, + thrust::detail::identity_ > + > +{}; + + +template +struct host_unary_transform_if_with_stencil_functor +{ + UnaryFunction unary_op; + Predicate pred; + + host_unary_transform_if_with_stencil_functor(UnaryFunction _unary_op, Predicate _pred) + : unary_op(_unary_op), pred(_pred) {} + + template + inline __host__ + typename enable_if_non_const_reference_or_tuple_of_iterator_references< + typename thrust::tuple_element<2,Tuple>::type + >::type + operator()(Tuple t) + { + if(pred(thrust::get<1>(t))) + thrust::get<2>(t) = unary_op(thrust::get<0>(t)); + } +}; // end host_unary_transform_if_with_stencil_functor + + +template +struct device_unary_transform_if_with_stencil_functor +{ + UnaryFunction unary_op; + Predicate pred; + + device_unary_transform_if_with_stencil_functor(UnaryFunction _unary_op, Predicate _pred) + : unary_op(_unary_op), pred(_pred) {} + + // add __host__ to allow the omp backend compile with nvcc + template + inline __host__ __device__ + typename enable_if_non_const_reference_or_tuple_of_iterator_references< + typename thrust::tuple_element<2,Tuple>::type + >::type + operator()(Tuple t) + { + if(pred(thrust::get<1>(t))) + 
thrust::get<2>(t) = unary_op(thrust::get<0>(t)); + } +}; // end device_unary_transform_if_with_stencil_functor + + +template + struct unary_transform_if_with_stencil_functor + : thrust::detail::eval_if< + thrust::detail::is_convertible::value, + thrust::detail::identity_ >, + thrust::detail::identity_ > + > +{}; + + +template +struct host_binary_transform_if_functor +{ + BinaryFunction binary_op; + Predicate pred; + + host_binary_transform_if_functor(BinaryFunction _binary_op, Predicate _pred) + : binary_op(_binary_op), pred(_pred) {} + + template + inline __host__ + typename enable_if_non_const_reference_or_tuple_of_iterator_references< + typename thrust::tuple_element<3,Tuple>::type + >::type + operator()(Tuple t) + { + if(pred(thrust::get<2>(t))) + thrust::get<3>(t) = binary_op(thrust::get<0>(t), thrust::get<1>(t)); + } +}; // end host_binary_transform_if_functor + + +template +struct device_binary_transform_if_functor +{ + BinaryFunction binary_op; + Predicate pred; + + device_binary_transform_if_functor(BinaryFunction _binary_op, Predicate _pred) + : binary_op(_binary_op), pred(_pred) {} + + // add __host__ to allow the omp backend compile with nvcc + template + inline __host__ __device__ + typename enable_if_non_const_reference_or_tuple_of_iterator_references< + typename thrust::tuple_element<3,Tuple>::type + >::type + operator()(Tuple t) + { + if(pred(thrust::get<2>(t))) + thrust::get<3>(t) = binary_op(thrust::get<0>(t), thrust::get<1>(t)); + } +}; // end device_binary_transform_if_functor + + +template + struct binary_transform_if_functor + : thrust::detail::eval_if< + thrust::detail::is_convertible::value, + thrust::detail::identity_ >, + thrust::detail::identity_ > + > +{}; + + +template + struct host_destroy_functor +{ + __host__ + void operator()(T &x) const + { + x.~T(); + } // end operator()() +}; // end host_destroy_functor + + +template + struct device_destroy_functor +{ + // add __host__ to allow the omp backend to compile with nvcc + __host__ __device__ + void operator()(T &x) const + { + x.~T(); + } // end operator()() +}; // end device_destroy_functor + + +template + struct destroy_functor + : thrust::detail::eval_if< + thrust::detail::is_convertible::value, + thrust::detail::identity_ >, + thrust::detail::identity_ > + > +{}; + + +template +struct fill_functor +{ + const T exemplar; + + fill_functor(const T& _exemplar) + : exemplar(_exemplar) {} + + __host__ __device__ + T operator()(void) const + { + return exemplar; + } +}; + + +template + struct uninitialized_fill_functor +{ + T exemplar; + + uninitialized_fill_functor(T x):exemplar(x){} + + __host__ __device__ + void operator()(T &x) + { + ::new(static_cast(&x)) T(exemplar); + } // end operator()() +}; // end uninitialized_fill_functor + + +// this predicate tests two two-element tuples +// we first use a Compare for the first element +// if the first elements are equivalent, we use +// < for the second elements +template + struct compare_first_less_second +{ + compare_first_less_second(Compare c) + : comp(c) {} + + template + __host__ __device__ + bool operator()(T1 lhs, T2 rhs) + { + return comp(thrust::get<0>(lhs), thrust::get<0>(rhs)) || (!comp(thrust::get<0>(rhs), thrust::get<0>(lhs)) && thrust::get<1>(lhs) < thrust::get<1>(rhs)); + } + + Compare comp; +}; // end compare_first_less_second + + +template + struct compare_first +{ + Compare comp; + + compare_first(Compare comp) + : comp(comp) + {} + + template + __host__ __device__ + bool operator()(const Tuple1 &x, const Tuple2 &y) + { + return 
comp(thrust::raw_reference_cast(thrust::get<0>(x)), thrust::raw_reference_cast(thrust::get<0>(y))); + } +}; // end compare_first + + +} // end namespace detail +} // end namespace thrust + diff --git a/compat/thrust/detail/logical.inl b/compat/thrust/detail/logical.inl new file mode 100644 index 0000000..126a3e3 --- /dev/null +++ b/compat/thrust/detail/logical.inl @@ -0,0 +1,96 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file logical.inl + * \brief Inline file for logical.h. + */ + +#include +#include +#include +#include +#include + +namespace thrust +{ + + +template +bool all_of(const thrust::detail::execution_policy_base &exec, InputIterator first, InputIterator last, Predicate pred) +{ + using thrust::system::detail::generic::all_of; + return all_of(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, pred); +} // end all_of() + + +template +bool any_of(const thrust::detail::execution_policy_base &exec, InputIterator first, InputIterator last, Predicate pred) +{ + using thrust::system::detail::generic::any_of; + return any_of(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, pred); +} // end any_of() + + +template +bool none_of(const thrust::detail::execution_policy_base &exec, InputIterator first, InputIterator last, Predicate pred) +{ + using thrust::system::detail::generic::none_of; + return none_of(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, pred); +} // end none_of() + + +template +bool all_of(InputIterator first, InputIterator last, Predicate pred) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::all_of(select_system(system), first, last, pred); +} + + +template +bool any_of(InputIterator first, InputIterator last, Predicate pred) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::any_of(select_system(system), first, last, pred); +} + + +template +bool none_of(InputIterator first, InputIterator last, Predicate pred) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::none_of(select_system(system), first, last, pred); +} + + +} // end namespace thrust + diff --git a/compat/thrust/detail/malloc_and_free.h b/compat/thrust/detail/malloc_and_free.h new file mode 100644 index 0000000..57b1685 --- /dev/null +++ b/compat/thrust/detail/malloc_and_free.h @@ -0,0 +1,79 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace thrust +{ + +template +pointer malloc(const thrust::detail::execution_policy_base &exec, std::size_t n) +{ + using thrust::system::detail::generic::malloc; + + // XXX should use a hypothetical thrust::static_pointer_cast here + void *raw_ptr = static_cast(thrust::raw_pointer_cast(malloc(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), n))); + + return pointer(raw_ptr); +} + +template +pointer malloc(const thrust::detail::execution_policy_base &exec, std::size_t n) +{ + using thrust::system::detail::generic::malloc; + + T *raw_ptr = static_cast(thrust::raw_pointer_cast(malloc(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), n))); + + return pointer(raw_ptr); +} + + +// XXX WAR nvbug 992955 +#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC +#if CUDA_VERSION < 5000 + +// cudafe generates unqualified calls to free(int *volatile) +// which get confused with thrust::free +// spoof a thrust::free which simply maps to ::free +inline __host__ __device__ +void free(int *volatile ptr) +{ + ::free(ptr); +} + +#endif // CUDA_VERSION +#endif // THRUST_DEVICE_COMPILER + +template +void free(const thrust::detail::execution_policy_base &exec, Pointer ptr) +{ + using thrust::system::detail::generic::free; + + free(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), ptr); +} + +// XXX consider another form of free which does not take a system argument and +// instead infers the system from the pointer + +} // end namespace thrust + diff --git a/compat/thrust/detail/merge.inl b/compat/thrust/detail/merge.inl new file mode 100644 index 0000000..77f09f5 --- /dev/null +++ b/compat/thrust/detail/merge.inl @@ -0,0 +1,217 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file merge.inl + * \brief Inline file for merge.h. 
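+ *
+ * For illustration, a minimal sketch (assuming sorted thrust::device_vector<int>
+ * d_a and d_b, and d_result sized to d_a.size() + d_b.size()):
+ *
+ * \code
+ * // merge two sorted ranges into a single sorted range
+ * thrust::merge(d_a.begin(), d_a.end(), d_b.begin(), d_b.end(), d_result.begin());
+ * \endcode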
+ */ + +#include +#include +#include +#include +#include + +namespace thrust +{ + + +template + OutputIterator merge(const thrust::detail::execution_policy_base &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result) +{ + using thrust::system::detail::generic::merge; + return merge(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, last2, result); +} // end merge() + + +template + OutputIterator merge(const thrust::detail::execution_policy_base &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + StrictWeakCompare comp) +{ + using thrust::system::detail::generic::merge; + return merge(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, last2, result, comp); +} // end merge() + + +template + thrust::pair + merge_by_key(const thrust::detail::execution_policy_base &exec, + InputIterator1 keys_first1, InputIterator1 keys_last1, + InputIterator2 keys_first2, InputIterator2 keys_last2, + InputIterator3 values_first1, InputIterator4 values_first2, + OutputIterator1 keys_result, + OutputIterator2 values_result) +{ + using thrust::system::detail::generic::merge_by_key; + return merge_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result); +} // end merge_by_key() + + +template + thrust::pair + merge_by_key(const thrust::detail::execution_policy_base &exec, + InputIterator1 keys_first1, InputIterator1 keys_last1, + InputIterator2 keys_first2, InputIterator2 keys_last2, + InputIterator3 values_first1, InputIterator4 values_first2, + OutputIterator1 keys_result, + OutputIterator2 values_result, + Compare comp) +{ + using thrust::system::detail::generic::merge_by_key; + return merge_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result, comp); +} // end merge_by_key() + + +template + OutputIterator merge(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + StrictWeakOrdering comp) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + + System1 system1; + System2 system2; + System3 system3; + + return thrust::merge(select_system(system1,system2,system3), first1, last1, first2, last2, result, comp); +} // end merge() + + +template + OutputIterator merge(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + + System1 system1; + System2 system2; + System3 system3; + + return thrust::merge(select_system(system1,system2,system3), first1, last1, first2, last2, result); +} // end merge() + + +template + thrust::pair + merge_by_key(InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 keys_last2, + InputIterator3 values_first1, + InputIterator4 
values_first2, + OutputIterator1 keys_result, + OutputIterator2 values_result, + StrictWeakOrdering comp) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + typedef typename thrust::iterator_system::type System4; + typedef typename thrust::iterator_system::type System5; + typedef typename thrust::iterator_system::type System6; + + System1 system1; + System2 system2; + System3 system3; + System4 system4; + System5 system5; + System6 system6; + + return thrust::merge_by_key(select_system(system1,system2,system3,system4,system5,system6), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result, comp); +} // end merge_by_key() + + +template + thrust::pair + merge_by_key(InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 keys_last2, + InputIterator3 values_first1, + InputIterator4 values_first2, + OutputIterator1 keys_result, + OutputIterator2 values_result) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + typedef typename thrust::iterator_system::type System4; + typedef typename thrust::iterator_system::type System5; + typedef typename thrust::iterator_system::type System6; + + System1 system1; + System2 system2; + System3 system3; + System4 system4; + System5 system5; + System6 system6; + + return thrust::merge_by_key(select_system(system1,system2,system3,system4,system5,system6), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result); +} // end merge_by_key() + + +} // end thrust + diff --git a/compat/thrust/detail/minmax.h b/compat/thrust/detail/minmax.h new file mode 100644 index 0000000..a560ea1 --- /dev/null +++ b/compat/thrust/detail/minmax.h @@ -0,0 +1,55 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace thrust +{ + + +template +__host__ __device__ + T min THRUST_PREVENT_MACRO_SUBSTITUTION (const T &lhs, const T &rhs, BinaryPredicate comp) +{ + return comp(rhs, lhs) ? rhs : lhs; +} // end min() + +template +__host__ __device__ + T min THRUST_PREVENT_MACRO_SUBSTITUTION (const T &lhs, const T &rhs) +{ + return rhs < lhs ? rhs : lhs; +} // end min() + +template +__host__ __device__ + T max THRUST_PREVENT_MACRO_SUBSTITUTION (const T &lhs, const T &rhs, BinaryPredicate comp) +{ + return comp(lhs,rhs) ? rhs : lhs; +} // end max() + +template +__host__ __device__ + T max THRUST_PREVENT_MACRO_SUBSTITUTION (const T &lhs, const T &rhs) +{ + return lhs < rhs ? 
rhs : lhs; +} // end max() + + +} // end thrust + diff --git a/compat/thrust/detail/mismatch.inl b/compat/thrust/detail/mismatch.inl new file mode 100644 index 0000000..37ac663 --- /dev/null +++ b/compat/thrust/detail/mismatch.inl @@ -0,0 +1,93 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file mismatch.inl + * \brief Inline file for mismatch.h + */ + + +#include +#include +#include +#include +#include +#include + +namespace thrust +{ + + +template +thrust::pair mismatch(const thrust::detail::execution_policy_base &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2) +{ + using thrust::system::detail::generic::mismatch; + return mismatch(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2); +} // end mismatch() + + +template +thrust::pair mismatch(const thrust::detail::execution_policy_base &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + BinaryPredicate pred) +{ + using thrust::system::detail::generic::mismatch; + return mismatch(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, pred); +} // end mismatch() + + +template +thrust::pair mismatch(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + + System1 system1; + System2 system2; + + return thrust::mismatch(select_system(system1,system2), first1, last1, first2); +} // end mismatch() + + +template +thrust::pair mismatch(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + BinaryPredicate pred) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + + System1 system1; + System2 system2; + + return thrust::mismatch(select_system(system1,system2), first1, last1, first2, pred); +} // end mismatch() + + +} // end namespace thrust + diff --git a/compat/thrust/detail/mpl/math.h b/compat/thrust/detail/mpl/math.h new file mode 100644 index 0000000..80adfc1 --- /dev/null +++ b/compat/thrust/detail/mpl/math.h @@ -0,0 +1,174 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file math.h + * \brief Math-related metaprogramming functionality. 
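+ *
+ * For illustration, these metafunctions evaluate entirely at compile time; a
+ * minimal sketch (assuming log2's second, counter parameter defaults to 0):
+ *
+ * \code
+ * // thrust::detail::mpl::math::log2<256>::value == 8
+ * // thrust::detail::mpl::math::min<int, 3, 7>::value == 3
+ * \endcode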
+ */ + + +#pragma once + +namespace thrust +{ + +namespace detail +{ + +namespace mpl +{ + +namespace math +{ + +namespace detail +{ + +// compute the log base-2 of an integer at compile time +template +struct log2 +{ + static const unsigned int value = log2::value; +}; + +template +struct log2<1, Cur> +{ + static const unsigned int value = Cur; +}; + +template +struct log2<0, Cur> +{ + // undefined +}; + +} // end namespace detail + + +template +struct log2 +{ + static const unsigned int value = detail::log2::value; +}; + + +template +struct min +{ + static const T value = (lhs < rhs) ? lhs : rhs; +}; + + +template +struct max +{ + static const T value = (!(lhs < rhs)) ? lhs : rhs; +}; + + +template + struct mul +{ + static const result_type value = x * y; +}; + + +template + struct mod +{ + static const result_type value = x % y; +}; + + +template + struct div +{ + static const result_type value = x / y; +}; + + +template + struct geq +{ + static const bool value = x >= y; +}; + + +template + struct lt +{ + static const bool value = x < y; +}; + + +template + struct gt +{ + static const bool value = x > y; +}; + + +template + struct or_ +{ + static const bool value = (x || y); +}; + + +template + struct bit_and +{ + static const result_type value = x & y; +}; + + +template + struct plus +{ + static const result_type value = x + y; +}; + + +template + struct minus +{ + static const result_type value = x - y; +}; + + +template + struct equal +{ + static const bool value = x == y; +}; + + +template + struct is_odd +{ + static const bool value = x & 1; +}; + + +} // end namespace math + +} // end namespace mpl + +} // end namespace detail + +} // end namespace thrust + diff --git a/compat/thrust/detail/numeric_traits.h b/compat/thrust/detail/numeric_traits.h new file mode 100644 index 0000000..a3bc56c --- /dev/null +++ b/compat/thrust/detail/numeric_traits.h @@ -0,0 +1,130 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +//#include // for intmax_t (not provided on MSVS 2005) + +namespace thrust +{ + +namespace detail +{ + +// XXX good enough for the platforms we care about +typedef long long intmax_t; + +template + struct is_signed + : integral_constant::is_signed> +{}; // end is_signed + + +template + struct num_digits + : eval_if< + std::numeric_limits::is_specialized, + integral_constant< + int, + std::numeric_limits::digits + >, + integral_constant< + int, + sizeof(T) * std::numeric_limits::digits - (is_signed::value ? 
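To illustrate the metafunctions in this header (an editorial sketch; the template parameter order, type first and then the constant operands, is inferred rather than quoted from the patch), the expected values can be checked at compile time with a C++03-style negative-array-size trick:

#include <thrust/detail/mpl/math.h>

namespace mpl_math_example
{
  // a wrong value would produce a negative array size and fail to compile
  typedef char check_log2[thrust::detail::mpl::math::log2<8>::value == 3        ? 1 : -1];
  typedef char check_min [thrust::detail::mpl::math::min<int, 3, 5>::value == 3 ? 1 : -1];
  typedef char check_odd [thrust::detail::mpl::math::is_odd<int, 7>::value      ? 1 : -1];
}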
1 : 0) + > + >::type +{}; // end num_digits + + +template + struct integer_difference + //: eval_if< + // sizeof(Integer) >= sizeof(intmax_t), + // eval_if< + // is_signed::value, + // identity_, + // identity_ + // >, + // eval_if< + // sizeof(Integer) < sizeof(std::ptrdiff_t), + // identity_, + // identity_ + // > + // > +{ + private: + // XXX workaround a pedantic warning in old versions of g++ + // which complains about &&ing with a constant value + template + struct and_ + { + static const bool value = false; + }; + + template + struct and_ + { + static const bool value = y; + }; + + public: + typedef typename + eval_if< + and_< + std::numeric_limits::is_signed, + // digits is the number of no-sign bits + (!std::numeric_limits::is_bounded || (int(std::numeric_limits::digits) + 1 >= num_digits::value)) + >::value, + identity_, + eval_if< + int(std::numeric_limits::digits) + 1 < num_digits::value, + identity_, + eval_if< + int(std::numeric_limits::digits) + 1 < num_digits::value, + identity_, + identity_ + > + > + >::type type; +}; // end integer_difference + + +template + struct numeric_difference + : eval_if< + is_integral::value, + integer_difference, + identity_ + > +{}; // end numeric_difference + + +template +__host__ __device__ +typename numeric_difference::type +numeric_distance(Number x, Number y) +{ + typedef typename numeric_difference::type difference_type; + return difference_type(y) - difference_type(x); +} // end numeric_distance + +} // end detail + +} // end thrust + diff --git a/compat/thrust/detail/overlapped_copy.h b/compat/thrust/detail/overlapped_copy.h new file mode 100644 index 0000000..a5540b8 --- /dev/null +++ b/compat/thrust/detail/overlapped_copy.h @@ -0,0 +1,131 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace detail +{ + + +template + OutputIterator sequential_copy(InputIterator first, + InputIterator last, + OutputIterator result) +{ + for(; first != last; ++first, ++result) + { + *result = *first; + } // end for + + return result; +} // end sequential_copy() + + +template + BidirectionalIterator2 sequential_copy_backward(BidirectionalIterator1 first, + BidirectionalIterator1 last, + BidirectionalIterator2 result) +{ + // yes, we preincrement + // the ranges are open on the right, i.e. 
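As a small illustration of numeric_distance and numeric_difference above (editorial sketch; the values are arbitrary), the subtraction is performed in the promoted difference type rather than in the operands' own type:

#include <thrust/detail/numeric_traits.h>

void numeric_distance_example(void)
{
  int x = 3, y = 10;

  // numeric_difference<int>::type is a signed type wide enough to hold the result
  thrust::detail::numeric_difference<int>::type d = thrust::detail::numeric_distance(x, y);  // d == 7
  (void)d;
}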
[first, last) + while(first != last) + { + *--result = *--last; + } // end while + + return result; +} // end sequential_copy_backward() + + +namespace dispatch +{ + + +template + RandomAccessIterator2 overlapped_copy(thrust::system::cpp::detail::execution_policy &, + RandomAccessIterator1 first, + RandomAccessIterator1 last, + RandomAccessIterator2 result) +{ + if(first < last && first <= result && result < last) + { + // result lies in [first, last) + // it's safe to use std::copy_backward here + thrust::detail::sequential_copy_backward(first, last, result + (last - first)); + result += (last - first); + } // end if + else + { + // result + (last - first) lies in [first, last) + // it's safe to use sequential_copy here + result = thrust::detail::sequential_copy(first, last, result); + } // end else + + return result; +} // end overlapped_copy() + + +template + RandomAccessIterator2 overlapped_copy(thrust::execution_policy &exec, + RandomAccessIterator1 first, + RandomAccessIterator1 last, + RandomAccessIterator2 result) +{ + typedef typename thrust::iterator_value::type value_type; + + // make a temporary copy of [first,last), and copy into it first + thrust::detail::temporary_array temp(exec, first, last); + return thrust::copy(exec, temp.begin(), temp.end(), result); +} // end overlapped_copy() + +} // end dispatch + + +template + RandomAccessIterator2 overlapped_copy(RandomAccessIterator1 first, + RandomAccessIterator1 last, + RandomAccessIterator2 result) +{ + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + + typedef typename thrust::detail::minimum_system::type System; + + // XXX presumes System is default constructible + System system; + + return thrust::detail::dispatch::overlapped_copy(system, first, last, result); +} // end overlapped_copy() + +} // end detail +} // end thrust + diff --git a/compat/thrust/detail/pair.inl b/compat/thrust/detail/pair.inl new file mode 100644 index 0000000..776bdc2 --- /dev/null +++ b/compat/thrust/detail/pair.inl @@ -0,0 +1,225 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
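To illustrate the dispatch above (an editorial sketch with made-up data), the host path detects that the destination overlaps the source and copies back-to-front, much like std::memmove:

#include <thrust/detail/overlapped_copy.h>

void overlapped_copy_example(void)
{
  int data[8] = {0, 1, 2, 3, 4, 5, 6, 7};

  // destination [data+2, data+8) overlaps source [data, data+6),
  // so sequential_copy_backward is chosen and no element is overwritten before it is read
  thrust::detail::overlapped_copy(data, data + 6, data + 2);

  // data is now {0, 1, 0, 1, 2, 3, 4, 5}
}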
+ */ + +#include +#include + +namespace thrust +{ + +template + pair + ::pair(void) + :first(),second() +{ + ; +} // end pair::pair() + + +template + pair + ::pair(const T1 &x, const T2 &y) + :first(x),second(y) +{ + ; +} // end pair::pair() + + +template + template + pair + ::pair(const pair &p) + :first(p.first),second(p.second) +{ + ; +} // end pair::pair() + + +template + template + pair + ::pair(const std::pair &p) + :first(p.first),second(p.second) +{ + ; +} // end pair::pair() + + +template + inline __host__ __device__ + void pair + ::swap(thrust::pair &p) +{ + using thrust::swap; + + swap(first, p.first); + swap(second, p.second); +} // end pair::swap() + + +template + inline __host__ __device__ + bool operator==(const pair &x, const pair &y) +{ + return x.first == y.first && x.second == y.second; +} // end operator==() + + +template + inline __host__ __device__ + bool operator<(const pair &x, const pair &y) +{ + return x.first < y.first || (!(y.first < x.first) && x.second < y.second); +} // end operator<() + + +template + inline __host__ __device__ + bool operator!=(const pair &x, const pair &y) +{ + return !(x == y); +} // end operator==() + + +template + inline __host__ __device__ + bool operator>(const pair &x, const pair &y) +{ + return y < x; +} // end operator<() + + +template + inline __host__ __device__ + bool operator<=(const pair &x, const pair &y) +{ + return !(y < x); +} // end operator<=() + + +template + inline __host__ __device__ + bool operator>=(const pair &x, const pair &y) +{ + return !(x < y); +} // end operator>=() + + +template + inline __host__ __device__ + void swap(pair &x, pair &y) +{ + return x.swap(y); +} // end swap() + + +template + inline __host__ __device__ + pair make_pair(T1 x, T2 y) +{ + return pair(x,y); +} // end make_pair() + + +// specializations of tuple_element for pair +template + struct tuple_element<0, pair > +{ + typedef T1 type; +}; // end tuple_element + +template + struct tuple_element<1, pair > +{ + typedef T2 type; +}; // end tuple_element + + +// specialization of tuple_size for pair +template + struct tuple_size< pair > +{ + static const unsigned int value = 2; +}; // end tuple_size + + + +namespace detail +{ + + +template struct pair_get {}; + +template + struct pair_get<0, Pair> +{ + inline __host__ __device__ + const typename tuple_element<0, Pair>::type & + operator()(const Pair &p) const + { + return p.first; + } // end operator()() + + inline __host__ __device__ + typename tuple_element<0, Pair>::type & + operator()(Pair &p) const + { + return p.first; + } // end operator()() +}; // end pair_get + + +template + struct pair_get<1, Pair> +{ + inline __host__ __device__ + const typename tuple_element<1, Pair>::type & + operator()(const Pair &p) const + { + return p.second; + } // end operator()() + + inline __host__ __device__ + typename tuple_element<1, Pair>::type & + operator()(Pair &p) const + { + return p.second; + } // end operator()() +}; // end pair_get + +} // end detail + + + +template + inline __host__ __device__ + typename tuple_element >::type & + get(pair &p) +{ + return detail::pair_get >()(p); +} // end get() + +template + inline __host__ __device__ + const typename tuple_element >::type & + get(const pair &p) +{ + return detail::pair_get >()(p); +} // end get() + + +} // end thrust + diff --git a/compat/thrust/detail/partition.inl b/compat/thrust/detail/partition.inl new file mode 100644 index 0000000..19ef08a --- /dev/null +++ b/compat/thrust/detail/partition.inl @@ -0,0 +1,398 @@ +/* + * Copyright 2008-2012 
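The pair interface implemented above can be exercised as follows (editorial sketch; the values are arbitrary):

#include <thrust/pair.h>

void pair_example(void)
{
  thrust::pair<int, float> p = thrust::make_pair(1, 2.0f);

  int   a = thrust::get<0>(p);   // 1, via the pair_get helpers above
  float b = thrust::get<1>(p);   // 2.0f

  // comparison is lexicographic: first members first, then second members
  bool less = thrust::make_pair(1, 5) < thrust::make_pair(2, 0);   // true

  (void)a; (void)b; (void)less;
}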
NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file partition.inl + * \brief Inline file for partition.h. + */ + +#include +#include +#include +#include +#include +#include + +namespace thrust +{ + + +template + ForwardIterator partition(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + Predicate pred) +{ + using thrust::system::detail::generic::partition; + return partition(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, pred); +} // end partition() + + +template + ForwardIterator partition(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + InputIterator stencil, + Predicate pred) +{ + using thrust::system::detail::generic::partition; + return partition(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, stencil, pred); +} // end partition() + + +template + thrust::pair + partition_copy(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + OutputIterator1 out_true, + OutputIterator2 out_false, + Predicate pred) +{ + using thrust::system::detail::generic::partition_copy; + return partition_copy(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, out_true, out_false, pred); +} // end partition_copy() + + +template + thrust::pair + partition_copy(const thrust::detail::execution_policy_base &exec, + InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator1 out_true, + OutputIterator2 out_false, + Predicate pred) +{ + using thrust::system::detail::generic::partition_copy; + return partition_copy(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, stencil, out_true, out_false, pred); +} // end partition_copy() + + +template + ForwardIterator stable_partition(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + Predicate pred) +{ + using thrust::system::detail::generic::stable_partition; + return stable_partition(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, pred); +} // end stable_partition() + + +template + ForwardIterator stable_partition(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + InputIterator stencil, + Predicate pred) +{ + using thrust::system::detail::generic::stable_partition; + return stable_partition(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, stencil, pred); +} // end stable_partition() + + +template + thrust::pair + stable_partition_copy(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + OutputIterator1 out_true, + OutputIterator2 out_false, + Predicate pred) +{ + using thrust::system::detail::generic::stable_partition_copy; + return stable_partition_copy(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, out_true, 
out_false, pred); +} // end stable_partition_copy() + + +template + thrust::pair + stable_partition_copy(const thrust::detail::execution_policy_base &exec, + InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator1 out_true, + OutputIterator2 out_false, + Predicate pred) +{ + using thrust::system::detail::generic::stable_partition_copy; + return stable_partition_copy(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, stencil, out_true, out_false, pred); +} // end stable_partition_copy() + + +template + ForwardIterator partition_point(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + Predicate pred) +{ + using thrust::system::detail::generic::partition_point; + return partition_point(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, pred); +} // end partition_point() + + +template + bool is_partitioned(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + Predicate pred) +{ + using thrust::system::detail::generic::is_partitioned; + return is_partitioned(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, pred); +} // end is_partitioned() + + +template + ForwardIterator partition(ForwardIterator first, + ForwardIterator last, + Predicate pred) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::partition(select_system(system), first, last, pred); +} // end partition() + + +template + ForwardIterator partition(ForwardIterator first, + ForwardIterator last, + InputIterator stencil, + Predicate pred) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + + System1 system1; + System2 system2; + + return thrust::partition(select_system(system1,system2), first, last, stencil, pred); +} // end partition() + + +template + ForwardIterator stable_partition(ForwardIterator first, + ForwardIterator last, + Predicate pred) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::stable_partition(select_system(system), first, last, pred); +} // end stable_partition() + + +template + ForwardIterator stable_partition(ForwardIterator first, + ForwardIterator last, + InputIterator stencil, + Predicate pred) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + + System1 system1; + System2 system2; + + return thrust::stable_partition(select_system(system1,system2), first, last, stencil, pred); +} // end stable_partition() + + +template + thrust::pair + partition_copy(InputIterator first, + InputIterator last, + OutputIterator1 out_true, + OutputIterator2 out_false, + Predicate pred) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + + System1 system1; + System2 system2; + System3 system3; + + return thrust::partition_copy(select_system(system1,system2,system3), first, last, out_true, out_false, pred); +} // end partition_copy() + + +template + thrust::pair + 
partition_copy(InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator1 out_true, + OutputIterator2 out_false, + Predicate pred) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + typedef typename thrust::iterator_system::type System4; + + System1 system1; + System2 system2; + System3 system3; + System4 system4; + + return thrust::partition_copy(select_system(system1,system2,system3,system4), first, last, stencil, out_true, out_false, pred); +} // end partition_copy() + + +template + thrust::pair + stable_partition_copy(InputIterator first, + InputIterator last, + OutputIterator1 out_true, + OutputIterator2 out_false, + Predicate pred) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + + System1 system1; + System2 system2; + System3 system3; + + return thrust::stable_partition_copy(select_system(system1,system2,system3), first, last, out_true, out_false, pred); +} // end stable_partition_copy() + + +template + thrust::pair + stable_partition_copy(InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator1 out_true, + OutputIterator2 out_false, + Predicate pred) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + typedef typename thrust::iterator_system::type System4; + + System1 system1; + System2 system2; + System3 system3; + System4 system4; + + return thrust::stable_partition_copy(select_system(system1,system2,system3,system4), first, last, stencil, out_true, out_false, pred); +} // end stable_partition_copy() + + +template + ForwardIterator partition_point(ForwardIterator first, + ForwardIterator last, + Predicate pred) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::partition_point(select_system(system), first, last, pred); +} // end partition_point() + + +template + bool is_partitioned(InputIterator first, + InputIterator last, + Predicate pred) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::is_partitioned(select_system(system), first, last, pred); +} // end is_partitioned() + + +} // end thrust + diff --git a/compat/thrust/detail/pointer.h b/compat/thrust/detail/pointer.h new file mode 100644 index 0000000..bc97939 --- /dev/null +++ b/compat/thrust/detail/pointer.h @@ -0,0 +1,184 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
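A host-side sketch of the partition entry points defined in this file (editorial; the predicate and data are hypothetical):

#include <thrust/partition.h>

struct is_even
{
  __host__ __device__
  bool operator()(int x) const { return (x & 1) == 0; }
};

void partition_example(void)
{
  int v[6] = {1, 2, 3, 4, 5, 6};

  // even values are moved in front of odd ones; the returned pointer marks the split point
  int *middle = thrust::partition(v, v + 6, is_even());

  bool ok = thrust::is_partitioned(v, v + 6, is_even());   // true
  (void)middle; (void)ok;
}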
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace thrust +{ + +// declare pointer with default values of template parameters +template class pointer; + +} // end thrust + + +// specialize std::iterator_traits to avoid problems with the name of +// pointer's constructor shadowing its nested pointer type +// do this before pointer is defined so the specialization is correctly +// used inside the definition +namespace std +{ + +template + struct iterator_traits > +{ + private: + typedef thrust::pointer ptr; + + public: + typedef typename ptr::iterator_category iterator_category; + typedef typename ptr::value_type value_type; + typedef typename ptr::difference_type difference_type; + // XXX implement this type (the result of operator->) later + typedef void pointer; + typedef typename ptr::reference reference; +}; // end iterator_traits + +} // end std + + +namespace thrust +{ + +namespace detail +{ + +// this metafunction computes the type of iterator_adaptor thrust::pointer should inherit from +template + struct pointer_base +{ + // void pointers should have no element type + // note that we remove_cv from the Element type to get the value_type + typedef typename thrust::detail::eval_if< + thrust::detail::is_void::type>::value, + thrust::detail::identity_, + thrust::detail::remove_cv + >::type value_type; + + // if no Derived type is given, just use pointer + typedef typename thrust::detail::eval_if< + thrust::detail::is_same::value, + thrust::detail::identity_ >, + thrust::detail::identity_ + >::type derived_type; + + // void pointers should have no reference type + // if no Reference type is given, just use reference + typedef typename thrust::detail::eval_if< + thrust::detail::is_void::type>::value, + thrust::detail::identity_, + thrust::detail::eval_if< + thrust::detail::is_same::value, + thrust::detail::identity_ >, + thrust::detail::identity_ + > + >::type reference_arg; + + typedef thrust::iterator_adaptor< + derived_type, // pass along the type of our Derived class to iterator_adaptor + Element *, // we adapt a raw pointer + value_type, // the value type + Tag, // system tag + thrust::random_access_traversal_tag, // pointers have random access traversal + reference_arg, // pass along our Reference type + std::ptrdiff_t + > type; +}; // end pointer_base + + +} // end detail + + +// the base type for all of thrust's tagged pointers. +// for reasonable pointer-like semantics, derived types should reimplement the following: +// 1. no-argument constructor +// 2. constructor from OtherElement * +// 3. constructor from OtherPointer related by convertibility +// 4. assignment from OtherPointer related by convertibility +// These should just call the corresponding members of pointer. 
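The four requirements listed above can be made concrete with a minimal derived pointer for a hypothetical system tag (editorial sketch; my_tag and my_pointer are invented names, and the Element/Tag/Reference/Derived parameter order is assumed from the declarations in this file):

#include <thrust/memory.h>

struct my_tag {};

template<typename T>
class my_pointer
  : public thrust::pointer<T, my_tag, thrust::use_default, my_pointer<T> >
{
  typedef thrust::pointer<T, my_tag, thrust::use_default, my_pointer<T> > super_t;

  public:
    // 1. no-argument constructor
    __host__ __device__
    my_pointer() : super_t() {}

    // 2. constructor from a raw pointer to a convertible element type
    template<typename OtherT>
    __host__ __device__
    explicit my_pointer(OtherT *ptr) : super_t(ptr) {}

    // 3. converting constructor from a related pointer type
    template<typename OtherPointer>
    __host__ __device__
    my_pointer(const OtherPointer &other,
               typename thrust::detail::enable_if_pointer_is_convertible<
                 OtherPointer, my_pointer<T>
               >::type * = 0)
      : super_t(other) {}

    // 4. converting assignment from a related pointer type
    template<typename OtherPointer>
    __host__ __device__
    typename thrust::detail::enable_if_pointer_is_convertible<
      OtherPointer, my_pointer<T>, my_pointer<T> &
    >::type
    operator=(const OtherPointer &other)
    {
      super_t::operator=(other);
      return *this;
    }
};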
+template + class pointer + : public thrust::detail::pointer_base::type +{ + private: + typedef typename thrust::detail::pointer_base::type super_t; + + typedef typename thrust::detail::pointer_base::derived_type derived_type; + + // friend iterator_core_access to give it access to dereference + friend class thrust::iterator_core_access; + + __host__ __device__ + typename super_t::reference dereference() const; + + // don't provide access to this part of super_t's interface + using super_t::base; + using typename super_t::base_type; + + public: + typedef typename super_t::base_type raw_pointer; + + // constructors + + __host__ __device__ + pointer(); + + // OtherValue shall be convertible to Value + // XXX consider making the pointer implementation a template parameter which defaults to Element * + template + __host__ __device__ + explicit pointer(OtherElement *ptr); + + // OtherPointer's element_type shall be convertible to Element + // OtherPointer's system shall be convertible to Tag + template + __host__ __device__ + pointer(const OtherPointer &other, + typename thrust::detail::enable_if_pointer_is_convertible< + OtherPointer, + pointer + >::type * = 0); + + // assignment + + // OtherPointer's element_type shall be convertible to Element + // OtherPointer's system shall be convertible to Tag + template + __host__ __device__ + typename thrust::detail::enable_if_pointer_is_convertible< + OtherPointer, + pointer, + derived_type & + >::type + operator=(const OtherPointer &other); + + // observers + + __host__ __device__ + Element *get() const; +}; // end pointer + +} // end thrust + +#include + diff --git a/compat/thrust/detail/pointer.inl b/compat/thrust/detail/pointer.inl new file mode 100644 index 0000000..1d066b0 --- /dev/null +++ b/compat/thrust/detail/pointer.inl @@ -0,0 +1,143 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + + +namespace thrust +{ + + +template + pointer + ::pointer() + : super_t(static_cast(0)) +{} // end pointer::pointer + + +template + template + pointer + ::pointer(OtherElement *other) + : super_t(other) +{} // end pointer::pointer + + +template + template + pointer + ::pointer(const OtherPointer &other, + typename thrust::detail::enable_if_pointer_is_convertible< + OtherPointer, + pointer + >::type *) + : super_t(thrust::detail::pointer_traits::get(other)) +{} // end pointer::pointer + + +template + template + typename thrust::detail::enable_if_pointer_is_convertible< + OtherPointer, + pointer, + typename pointer::derived_type & + >::type + pointer + ::operator=(const OtherPointer &other) +{ + super_t::base_reference() = thrust::detail::pointer_traits::get(other); + return static_cast(*this); +} // end pointer::operator= + + +template + typename pointer::super_t::reference + pointer + ::dereference() const +{ + return typename super_t::reference(static_cast(*this)); +} // end pointer::dereference + + +template + Element *pointer + ::get() const +{ + return super_t::base(); +} // end pointer::get + + +namespace detail +{ + +#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC) && (_MSC_VER <= 1400) +// XXX WAR MSVC 2005 problem with correctly implementing +// pointer_raw_pointer for pointer by specializing it here +template + struct pointer_raw_pointer< thrust::pointer > +{ + typedef typename pointer::raw_pointer type; +}; // end pointer_raw_pointer +#endif + + +#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC) && (THRUST_GCC_VERSION < 40200) +// XXX WAR g++-4.1 problem with correctly implementing +// pointer_element for pointer by specializing it here +template + struct pointer_element< thrust::pointer > +{ + typedef Element type; +}; // end pointer_element + +template + struct pointer_element< thrust::pointer > + : pointer_element< thrust::pointer > +{}; // end pointer_element + +template + struct pointer_element< thrust::pointer > + : pointer_element< thrust::pointer > +{}; // end pointer_element + + + +// XXX WAR g++-4.1 problem with correctly implementing +// rebind_pointer for pointer by specializing it here +template + struct rebind_pointer, NewElement> +{ + // XXX note we don't attempt to rebind the pointer's Reference type (or Derived) + typedef thrust::pointer type; +}; + +template + struct rebind_pointer, NewElement> + : rebind_pointer, NewElement> +{}; + +template + struct rebind_pointer, NewElement> + : rebind_pointer, NewElement> +{}; +#endif + +} // end namespace detail + + +} // end thrust + diff --git a/compat/thrust/detail/range/tail_flags.h b/compat/thrust/detail/range/tail_flags.h new file mode 100644 index 0000000..06fd9f8 --- /dev/null +++ b/compat/thrust/detail/range/tail_flags.h @@ -0,0 +1,124 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace detail +{ + + +template::type>, + typename IndexType = typename thrust::iterator_difference::type> + class tail_flags +{ + private: + struct tail_flag_functor + { + BinaryPredicate binary_pred; // this must be the first member for performance reasons + IndexType n; + + typedef bool result_type; + + tail_flag_functor(IndexType n) + : binary_pred(), n(n) + {} + + tail_flag_functor(IndexType n, BinaryPredicate binary_pred) + : binary_pred(binary_pred), n(n) + {} + + template + __host__ __device__ __thrust_forceinline__ + result_type operator()(const Tuple &t) + { + const IndexType i = thrust::get<0>(t); + + // note that we do not dereference the tuple's 2nd element when i >= n + // and therefore do not dereference a bad location at the boundary + return (i == (n - 1) || !binary_pred(thrust::get<1>(t), thrust::get<2>(t))); + } + }; + + typedef thrust::counting_iterator counting_iterator; + + public: + typedef thrust::transform_iterator< + tail_flag_functor, + thrust::zip_iterator > + > iterator; + + tail_flags(RandomAccessIterator first, RandomAccessIterator last) + : m_begin(thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(thrust::counting_iterator(0), first, first + 1)), + tail_flag_functor(last - first))), + m_end(m_begin + (last - first)) + {} + + tail_flags(RandomAccessIterator first, RandomAccessIterator last, BinaryPredicate binary_pred) + : m_begin(thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(thrust::counting_iterator(0), first, first + 1)), + tail_flag_functor(last - first, binary_pred))), + m_end(m_begin + (last - first)) + {} + + iterator begin() const + { + return m_begin; + } + + iterator end() const + { + return m_end; + } + + template + typename iterator::reference operator[](OtherIndex i) + { + return *(begin() + i); + } + + private: + iterator m_begin, m_end; +}; + + +template + tail_flags + make_tail_flags(RandomAccessIterator first, RandomAccessIterator last, BinaryPredicate binary_pred) +{ + return tail_flags(first, last, binary_pred); +} + + +template + tail_flags + make_tail_flags(RandomAccessIterator first, RandomAccessIterator last) +{ + return tail_flags(first, last); +} + + +} // end detail +} // end thrust + diff --git a/compat/thrust/detail/raw_pointer_cast.h b/compat/thrust/detail/raw_pointer_cast.h new file mode 100644 index 0000000..05e1e6b --- /dev/null +++ b/compat/thrust/detail/raw_pointer_cast.h @@ -0,0 +1,33 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
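As an illustration of the range adaptor above (editorial sketch; the key values are made up), tail flags mark the last element of each run of equal values:

#include <thrust/detail/range/tail_flags.h>

void tail_flags_example(void)
{
  int keys[6] = {1, 1, 2, 3, 3, 3};

  // for these keys the flags are {0, 1, 1, 0, 0, 1}
  thrust::detail::tail_flags<int *> flags = thrust::detail::make_tail_flags(keys, keys + 6);

  bool last_of_run = flags[4];   // false: keys[4] == keys[5]
  (void)last_of_run;
}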
+ */ + +#pragma once + +#include +#include + +namespace thrust +{ + +template + inline __host__ __device__ typename thrust::detail::pointer_traits::raw_pointer + raw_pointer_cast(const Pointer &ptr) +{ + return thrust::detail::pointer_traits::get(ptr); +} // end raw_pointer_cast() + +} // end thrust + diff --git a/compat/thrust/detail/raw_reference_cast.h b/compat/thrust/detail/raw_reference_cast.h new file mode 100644 index 0000000..1ffd7e5 --- /dev/null +++ b/compat/thrust/detail/raw_reference_cast.h @@ -0,0 +1,121 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace detail +{ + + +__THRUST_DEFINE_HAS_NESTED_TYPE(is_wrapped_reference, wrapped_reference_hint) + +namespace raw_reference_detail +{ + +template + struct raw_reference + : add_reference +{}; + + +// XXX consider making raw_reference an error + + +template + struct raw_reference< + T, + typename thrust::detail::enable_if< + is_wrapped_reference< + typename remove_cv::type + >::value + >::type + > +{ + typedef typename add_reference< + typename pointer_element::type + >::type type; +}; + +} // end raw_reference_ns + +template + struct raw_reference : + raw_reference_detail::raw_reference +{}; + + +// wrapped reference-like things which aren't strictly wrapped references +// (e.g. tuples of wrapped references) are considered unwrappable +template + struct is_unwrappable + : is_wrapped_reference +{}; + + +template + struct enable_if_unwrappable + : enable_if< + is_unwrappable::value, + Result + > +{}; + + +} // end detail + + +template + inline __host__ __device__ typename detail::raw_reference::type raw_reference_cast(T &ref) +{ + return *thrust::raw_pointer_cast(&ref); +} // end raw_reference_cast + + +template + inline __host__ __device__ typename detail::raw_reference::type raw_reference_cast(const T &ref) +{ + return *thrust::raw_pointer_cast(&ref); +} // end raw_reference_cast + + +template< + typename T0, typename T1, typename T2, + typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, + typename T9 +> +inline __host__ __device__ +typename detail::enable_if_unwrappable< + thrust::detail::tuple_of_iterator_references, + typename detail::raw_reference< + thrust::detail::tuple_of_iterator_references + >::type +>::type +raw_reference_cast(detail::tuple_of_iterator_references t); + + +} // end thrust + +#include + diff --git a/compat/thrust/detail/raw_reference_cast.inl b/compat/thrust/detail/raw_reference_cast.inl new file mode 100644 index 0000000..ea619ec --- /dev/null +++ b/compat/thrust/detail/raw_reference_cast.inl @@ -0,0 +1,277 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
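A typical use of raw_pointer_cast defined above (editorial sketch; thrust::device_vector is added elsewhere in this patch, and my_kernel is a hypothetical CUDA kernel):

#include <thrust/device_vector.h>
#include <thrust/detail/raw_pointer_cast.h>

void raw_pointer_cast_example(void)
{
  thrust::device_vector<int> v(10);

  // strip the device_ptr wrapper so the address can be passed to plain CUDA code
  int *raw = thrust::raw_pointer_cast(&v[0]);
  (void)raw;
  // my_kernel<<<1, 10>>>(raw);
}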
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ +namespace detail +{ + +// specialize is_unwrappable +// a tuple is_unwrappable if any of its elements is_unwrappable +template< + typename T0, typename T1, typename T2, + typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, + typename T9 +> + struct is_unwrappable< + thrust::tuple + > + : or_< + is_unwrappable, + is_unwrappable, + is_unwrappable, + is_unwrappable, + is_unwrappable, + is_unwrappable, + is_unwrappable, + is_unwrappable, + is_unwrappable, + is_unwrappable + > +{}; + + +// specialize is_unwrappable +// a tuple_of_iterator_references is_unwrappable if any of its elements is_unwrappable +template< + typename T0, typename T1, typename T2, + typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, + typename T9 +> + struct is_unwrappable< + thrust::detail::tuple_of_iterator_references + > + : or_< + is_unwrappable, + is_unwrappable, + is_unwrappable, + is_unwrappable, + is_unwrappable, + is_unwrappable, + is_unwrappable, + is_unwrappable, + is_unwrappable, + is_unwrappable + > +{}; + + +namespace raw_reference_detail +{ + +// unlike raw_reference, +// raw_reference_tuple_helper needs to return a value +// when it encounters one, rather than a reference +// upon encountering tuple, recurse +// +// we want the following behavior: +// 1. T -> T +// 2. T& -> T& +// 3. null_type -> null_type +// 4. reference -> T& +// 5. 
tuple_of_iterator_references -> tuple_of_iterator_references::type> + + +// wrapped references are unwrapped using raw_reference, otherwise, return T +template + struct raw_reference_tuple_helper + : eval_if< + is_unwrappable< + typename remove_cv::type + >::value, + raw_reference, + identity_ + > +{}; + + +// recurse on tuples +template < + typename T0, typename T1, typename T2, + typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, + typename T9 +> + struct raw_reference_tuple_helper< + thrust::tuple + > +{ + typedef thrust::tuple< + typename raw_reference_tuple_helper::type, + typename raw_reference_tuple_helper::type, + typename raw_reference_tuple_helper::type, + typename raw_reference_tuple_helper::type, + typename raw_reference_tuple_helper::type, + typename raw_reference_tuple_helper::type, + typename raw_reference_tuple_helper::type, + typename raw_reference_tuple_helper::type, + typename raw_reference_tuple_helper::type, + typename raw_reference_tuple_helper::type + > type; +}; + + +template < + typename T0, typename T1, typename T2, + typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, + typename T9 +> + struct raw_reference_tuple_helper< + thrust::detail::tuple_of_iterator_references + > +{ + typedef thrust::detail::tuple_of_iterator_references< + typename raw_reference_tuple_helper::type, + typename raw_reference_tuple_helper::type, + typename raw_reference_tuple_helper::type, + typename raw_reference_tuple_helper::type, + typename raw_reference_tuple_helper::type, + typename raw_reference_tuple_helper::type, + typename raw_reference_tuple_helper::type, + typename raw_reference_tuple_helper::type, + typename raw_reference_tuple_helper::type, + typename raw_reference_tuple_helper::type + > type; +}; + + +} // end raw_reference_detail + + +// if a tuple "tuple_type" is_unwrappable, +// then the raw_reference of tuple_type is a tuple of its members' raw_references +// else the raw_reference of tuple_type is tuple_type & +template < + typename T0, typename T1, typename T2, + typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, + typename T9 +> + struct raw_reference< + thrust::tuple + > +{ + private: + typedef thrust::tuple tuple_type; + + public: + typedef typename eval_if< + is_unwrappable::value, + raw_reference_detail::raw_reference_tuple_helper, + add_reference + >::type type; +}; + + +template < + typename T0, typename T1, typename T2, + typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, + typename T9 +> + struct raw_reference< + thrust::detail::tuple_of_iterator_references + > +{ + private: + typedef detail::tuple_of_iterator_references tuple_type; + + public: + typedef typename raw_reference_detail::raw_reference_tuple_helper::type type; + + // XXX figure out why is_unwrappable seems to be broken for tuple_of_iterator_references + //typedef typename eval_if< + // is_unwrappable::value, + // raw_reference_detail::raw_reference_tuple_helper, + // add_reference + //>::type type; +}; + + +struct raw_reference_caster +{ + template + __host__ __device__ + typename detail::raw_reference::type operator()(T &ref) + { + return thrust::raw_reference_cast(ref); + } + + template + __host__ __device__ + typename detail::raw_reference::type operator()(const T &ref) + { + return thrust::raw_reference_cast(ref); + } + + template< + typename T0, typename T1, typename T2, + typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, + typename T9 + > + __host__ 
__device__ + typename detail::raw_reference< + thrust::detail::tuple_of_iterator_references + >::type + operator()(thrust::detail::tuple_of_iterator_references t, + typename enable_if< + is_unwrappable >::value + >::type * = 0) + { + return thrust::raw_reference_cast(t); + } +}; // end raw_reference_caster + + +} // end detail + + +template< + typename T0, typename T1, typename T2, + typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, + typename T9 +> +__host__ __device__ +typename detail::enable_if_unwrappable< + thrust::detail::tuple_of_iterator_references, + typename detail::raw_reference< + thrust::detail::tuple_of_iterator_references + >::type +>::type +raw_reference_cast(thrust::detail::tuple_of_iterator_references t) +{ + thrust::detail::raw_reference_caster f; + + // note that we pass raw_reference_tuple_helper, not raw_reference as the unary metafunction + // the subtle difference is important + return thrust::detail::tuple_host_device_transform(t, f); +} // end raw_reference_cast + + +} // end thrust + diff --git a/compat/thrust/detail/reduce.inl b/compat/thrust/detail/reduce.inl new file mode 100644 index 0000000..ba84423 --- /dev/null +++ b/compat/thrust/detail/reduce.inl @@ -0,0 +1,261 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file reduce.inl + * \brief Inline file for reduce.h. 
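A host-side sketch of the entry points implemented in this file (editorial; data and functors are arbitrary):

#include <thrust/reduce.h>
#include <thrust/functional.h>

void reduce_example(void)
{
  int data[4] = {1, 2, 3, 4};

  int sum  = thrust::reduce(data, data + 4);                                 // 10
  int prod = thrust::reduce(data, data + 4, 1, thrust::multiplies<int>());   // 24

  int keys[5] = {1, 1, 2, 2, 2}, vals[5] = {10, 20, 30, 40, 50};
  int okeys[2], ovals[2];

  // contiguous equal keys are collapsed: okeys = {1, 2}, ovals = {30, 120}
  thrust::reduce_by_key(keys, keys + 5, vals, okeys, ovals);

  (void)sum; (void)prod;
}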
+ */ + +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ + + +template + typename thrust::iterator_traits::value_type + reduce(const thrust::detail::execution_policy_base &exec, InputIterator first, InputIterator last) +{ + using thrust::system::detail::generic::reduce; + return reduce(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last); +} // end reduce() + + +template + T reduce(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + T init) +{ + using thrust::system::detail::generic::reduce; + return reduce(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, init); +} // end reduce() + + +template + T reduce(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + T init, + BinaryFunction binary_op) +{ + using thrust::system::detail::generic::reduce; + return reduce(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, init, binary_op); +} // end reduce() + + +template + thrust::pair + reduce_by_key(const thrust::detail::execution_policy_base &exec, + InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + OutputIterator1 keys_output, + OutputIterator2 values_output) +{ + using thrust::system::detail::generic::reduce_by_key; + return reduce_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first, keys_last, values_first, keys_output, values_output); +} // end reduce_by_key() + + +template + thrust::pair + reduce_by_key(const thrust::detail::execution_policy_base &exec, + InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + OutputIterator1 keys_output, + OutputIterator2 values_output, + BinaryPredicate binary_pred) +{ + using thrust::system::detail::generic::reduce_by_key; + return reduce_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first, keys_last, values_first, keys_output, values_output, binary_pred); +} // end reduce_by_key() + + +template + thrust::pair + reduce_by_key(const thrust::detail::execution_policy_base &exec, + InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + OutputIterator1 keys_output, + OutputIterator2 values_output, + BinaryPredicate binary_pred, + BinaryFunction binary_op) +{ + using thrust::system::detail::generic::reduce_by_key; + return reduce_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first, keys_last, values_first, keys_output, values_output, binary_pred, binary_op); +} // end reduce_by_key() + + +template +typename thrust::iterator_traits::value_type + reduce(InputIterator first, + InputIterator last) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::reduce(select_system(system), first, last); +} + +template + T reduce(InputIterator first, + InputIterator last, + T init) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::reduce(select_system(system), first, last, init); +} + + +template + T reduce(InputIterator first, + InputIterator last, + T init, + BinaryFunction binary_op) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::reduce(select_system(system), 
first, last, init, binary_op); +} + +template + thrust::pair + reduce_by_key(InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + OutputIterator1 keys_output, + OutputIterator2 values_output) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + typedef typename thrust::iterator_system::type System4; + + System1 system1; + System2 system2; + System3 system3; + System4 system4; + + return thrust::reduce_by_key(select_system(system1,system2,system3,system4), keys_first, keys_last, values_first, keys_output, values_output); +} + +template + thrust::pair + reduce_by_key(InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + OutputIterator1 keys_output, + OutputIterator2 values_output, + BinaryPredicate binary_pred) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + typedef typename thrust::iterator_system::type System4; + + System1 system1; + System2 system2; + System3 system3; + System4 system4; + + return thrust::reduce_by_key(select_system(system1,system2,system3,system4), keys_first, keys_last, values_first, keys_output, values_output, binary_pred); +} + +template + thrust::pair + reduce_by_key(InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + OutputIterator1 keys_output, + OutputIterator2 values_output, + BinaryPredicate binary_pred, + BinaryFunction binary_op) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + typedef typename thrust::iterator_system::type System4; + + System1 system1; + System2 system2; + System3 system3; + System4 system4; + + return thrust::reduce_by_key(select_system(system1,system2,system3,system4), keys_first, keys_last, values_first, keys_output, values_output, binary_pred, binary_op); +} + +} // end namespace thrust + diff --git a/compat/thrust/detail/reference.h b/compat/thrust/detail/reference.h new file mode 100644 index 0000000..8c0b061 --- /dev/null +++ b/compat/thrust/detail/reference.h @@ -0,0 +1,167 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + + +namespace thrust +{ +namespace detail +{ + +template struct is_wrapped_reference; + +} + +// the base type for all of thrust's system-annotated references. +// for reasonable reference-like semantics, derived types must reimplement the following: +// 1. constructor from pointer +// 2. copy constructor +// 3. 
templated copy constructor from other reference +// 4. templated assignment from other reference +// 5. assignment from value_type +template + class reference +{ + private: + typedef typename thrust::detail::eval_if< + thrust::detail::is_same::value, + thrust::detail::identity_, + thrust::detail::identity_ + >::type derived_type; + + // hint for is_wrapped_reference lets it know that this type (or a derived type) + // is a wrapped reference + struct wrapped_reference_hint {}; + template friend struct thrust::detail::is_wrapped_reference; + + public: + typedef Pointer pointer; + typedef typename thrust::detail::remove_const::type value_type; + + __host__ __device__ + explicit reference(const pointer &ptr); + + template + __host__ __device__ + reference(const reference &other, + typename thrust::detail::enable_if_convertible< + typename reference::pointer, + pointer + >::type * = 0); + + __host__ __device__ + derived_type &operator=(const reference &other); + + // XXX this may need an enable_if + template + __host__ __device__ + derived_type &operator=(const reference &other); + + __host__ __device__ + derived_type &operator=(const value_type &x); + + __host__ __device__ + pointer operator&() const; + + __host__ __device__ + operator value_type () const; + + __host__ __device__ + void swap(derived_type &other); + + derived_type &operator++(); + + value_type operator++(int); + + // XXX parameterize the type of rhs + derived_type &operator+=(const value_type &rhs); + + derived_type &operator--(); + + value_type operator--(int); + + // XXX parameterize the type of rhs + derived_type &operator-=(const value_type &rhs); + + // XXX parameterize the type of rhs + derived_type &operator*=(const value_type &rhs); + + // XXX parameterize the type of rhs + derived_type &operator/=(const value_type &rhs); + + // XXX parameterize the type of rhs + derived_type &operator%=(const value_type &rhs); + + // XXX parameterize the type of rhs + derived_type &operator<<=(const value_type &rhs); + + // XXX parameterize the type of rhs + derived_type &operator>>=(const value_type &rhs); + + // XXX parameterize the type of rhs + derived_type &operator&=(const value_type &rhs); + + // XXX parameterize the type of rhs + derived_type &operator|=(const value_type &rhs); + + // XXX parameterize the type of rhs + derived_type &operator^=(const value_type &rhs); + + private: + const pointer m_ptr; + + // allow access to m_ptr for other references + template friend class reference; + + template + __host__ __device__ + inline value_type strip_const_get_value(const System &system) const; + + template + __host__ __device__ + inline void assign_from(OtherPointer src); + + // XXX this helper exists only to avoid warnings about null references from the other assign_from + template + inline __host__ __device__ + void assign_from(System1 *system1, System2 *system2, OtherPointer src); + + template + __host__ __device__ + inline void strip_const_assign_value(const System &system, OtherPointer src); + + // XXX this helper exists only to avoid warnings about null references from the other swap + template + inline __host__ __device__ + void swap(System *system, derived_type &other); + + // XXX this helper exists only to avoid warnings about null references from operator value_type () + template + inline __host__ __device__ + value_type convert_to_value_type(System *system) const; +}; // end reference + + +} // end thrust + +#include + diff --git a/compat/thrust/detail/reference.inl b/compat/thrust/detail/reference.inl new file mode 
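To make the reference semantics above concrete (editorial sketch; thrust::device_vector and its device_reference come from elsewhere in this patch):

#include <thrust/device_vector.h>

void reference_example(void)
{
  thrust::device_vector<int> v(1);

  v[0] = 42;      // operator[] yields a reference-derived proxy; assignment dispatches assign_value
  int x = v[0];   // conversion to value_type dispatches get_value (a device-to-host copy)
  (void)x;
}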
100644 index 0000000..8b55edb --- /dev/null +++ b/compat/thrust/detail/reference.inl @@ -0,0 +1,361 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace thrust +{ + + +template + template + reference + ::reference(const reference &other, + typename thrust::detail::enable_if_convertible< + typename reference::pointer, + pointer + >::type *) + : m_ptr(other.m_ptr) +{} + + +template + reference + ::reference(const pointer &ptr) + : m_ptr(ptr) +{} + + +template + typename reference::pointer + reference + ::operator&() const +{ + return m_ptr; +} // end reference::operator&() + + +template + typename reference::derived_type & + reference + ::operator=(const value_type &v) +{ + assign_from(&v); + return static_cast(*this); +} // end reference::operator=() + + +template + typename reference::derived_type & + reference + ::operator=(const reference &other) +{ + assign_from(&other); + return static_cast(*this); +} // end reference::operator=() + + +template + template + typename reference::derived_type & + reference + ::operator=(const reference &other) +{ + assign_from(&other); + return static_cast(*this); +} // end reference::operator=() + + +template + template + typename reference::value_type + reference + ::convert_to_value_type(System *system) const +{ + using thrust::system::detail::generic::select_system; + return strip_const_get_value(select_system(*system)); +} // end convert_to_value_type() + + +template + reference + ::operator typename reference::value_type () const +{ + typedef typename thrust::iterator_system::type System; + + // XXX avoid default-constructing a system + // XXX use null a reference for dispatching + // XXX this assumes that the eventual invocation of + // XXX get_value will not access system state + System *system = 0; + + return convert_to_value_type(system); +} // end reference::operator value_type () + + +template + template + typename reference::value_type + reference + ::strip_const_get_value(const System &system) const +{ + System &non_const_system = const_cast(system); + + using thrust::system::detail::generic::get_value; + + return get_value(thrust::detail::derived_cast(non_const_system), m_ptr); +} // end reference::strip_const_get_value() + + +template + template + void reference + ::assign_from(System1 *system1, System2 *system2, OtherPointer src) +{ + using thrust::system::detail::generic::select_system; + + strip_const_assign_value(select_system(*system1, *system2), src); +} // end assign_from() + + +template + template + void reference + ::assign_from(OtherPointer src) +{ + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + + // XXX avoid default-constructing a system + // XXX use null references for dispatching + // XXX this assumes that the eventual invocation of + // XXX assign_value will not access system state + System1 
*system1 = 0; + System2 *system2 = 0; + + assign_from(system1, system2, src); +} // end assign_from() + + +template + template + void reference + ::strip_const_assign_value(const System &system, OtherPointer src) +{ + System &non_const_system = const_cast(system); + + using thrust::system::detail::generic::assign_value; + + assign_value(thrust::detail::derived_cast(non_const_system), m_ptr, src); +} // end strip_const_assign_value() + + +template + template + void reference + ::swap(System *system, derived_type &other) +{ + using thrust::system::detail::generic::select_system; + using thrust::system::detail::generic::iter_swap; + + iter_swap(select_system(*system, *system), m_ptr, other.m_ptr); +} // end reference::swap() + + +template + void reference + ::swap(derived_type &other) +{ + typedef typename thrust::iterator_system::type System; + + // XXX avoid default-constructing a system + // XXX use null references for dispatching + // XXX this assumes that the eventual invocation + // XXX of iter_swap will not access system state + System *system = 0; + + swap(system, other); +} // end reference::swap() + + +template + typename reference::derived_type & + reference + ::operator++(void) +{ + value_type temp = *this; + ++temp; + *this = temp; + return static_cast(*this); +} // end reference::operator++() + + +template + typename reference::value_type + reference + ::operator++(int) +{ + value_type temp = *this; + value_type result = temp++; + *this = temp; + return result; +} // end reference::operator++() + + +template + typename reference::derived_type & + reference + ::operator+=(const value_type &rhs) +{ + value_type temp = *this; + temp += rhs; + *this = temp; + return static_cast(*this); +} // end reference::operator+=() + +template + typename reference::derived_type & + reference + ::operator--(void) +{ + value_type temp = *this; + --temp; + *this = temp; + return static_cast(*this); +} // end reference::operator--() + +template + typename reference::value_type + reference + ::operator--(int) +{ + value_type temp = *this; + value_type result = temp--; + *this = temp; + return result; +} // end reference::operator--() + +template + typename reference::derived_type & + reference + ::operator-=(const value_type &rhs) +{ + value_type temp = *this; + temp -= rhs; + *this = temp; + return static_cast(*this); +} // end reference::operator-=() + +template + typename reference::derived_type & + reference + ::operator*=(const value_type &rhs) +{ + value_type temp = *this; + temp *= rhs; + *this = temp; + return static_cast(*this); +} // end reference::operator*=() + +template + typename reference::derived_type & + reference + ::operator/=(const value_type &rhs) +{ + value_type temp = *this; + temp /= rhs; + *this = temp; + return static_cast(*this); +} // end reference::operator/=() + +template + typename reference::derived_type & + reference + ::operator%=(const value_type &rhs) +{ + value_type temp = *this; + temp %= rhs; + *this = temp; + return static_cast(*this); +} // end reference::operator%=() + +template + typename reference::derived_type & + reference + ::operator<<=(const value_type &rhs) +{ + value_type temp = *this; + temp <<= rhs; + *this = temp; + return static_cast(*this); +} // end reference::operator<<=() + +template + typename reference::derived_type & + reference + ::operator>>=(const value_type &rhs) +{ + value_type temp = *this; + temp >>= rhs; + *this = temp; + return static_cast(*this); +} // end reference::operator>>=() + +template + typename reference::derived_type & 
+ reference + ::operator&=(const value_type &rhs) +{ + value_type temp = *this; + temp &= rhs; + *this = temp; + return static_cast(*this); +} // end reference::operator&=() + +template + typename reference::derived_type & + reference + ::operator|=(const value_type &rhs) +{ + value_type temp = *this; + temp |= rhs; + *this = temp; + return static_cast(*this); +} // end reference::operator|=() + +template + typename reference::derived_type & + reference + ::operator^=(const value_type &rhs) +{ + value_type temp = *this; + temp ^= rhs; + *this = temp; + return static_cast(*this); +} // end reference::operator^=() + + +} // end thrust + diff --git a/compat/thrust/detail/reference_forward_declaration.h b/compat/thrust/detail/reference_forward_declaration.h new file mode 100644 index 0000000..60524d3 --- /dev/null +++ b/compat/thrust/detail/reference_forward_declaration.h @@ -0,0 +1,28 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace thrust +{ + +template class reference; + +} // end thrust + diff --git a/compat/thrust/detail/remove.inl b/compat/thrust/detail/remove.inl new file mode 100644 index 0000000..5675243 --- /dev/null +++ b/compat/thrust/detail/remove.inl @@ -0,0 +1,238 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file remove.inl + * \brief Inline file for remove.h. 
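// Illustrative usage sketch (not from the Thrust sources; function and variable names here
// are invented for the example): how the two overload layers defined in this file are used.
// The iterator-only form deduces a backend tag via thrust::iterator_system and routes it
// through select_system; the execution-policy form (thrust::host, assuming this Thrust
// version ships thrust/execution_policy.h) hands the chosen policy straight to the generic
// implementation, exactly as the definitions that follow do.
#include <thrust/host_vector.h>
#include <thrust/remove.h>

struct is_negative { bool operator()(int x) const { return x < 0; } };

int example_remove_if()
{
  thrust::host_vector<int> v(5);
  v[0] = 1; v[1] = -2; v[2] = 3; v[3] = -4; v[4] = 5;

  // backend deduced from the iterators (the cpp system for host_vector)
  thrust::host_vector<int>::iterator new_end =
      thrust::remove_if(v.begin(), v.end(), is_negative());

  // equivalent call naming the execution policy explicitly:
  // thrust::remove_if(thrust::host, v.begin(), v.end(), is_negative());

  v.erase(new_end, v.end());          // v now holds {1, 3, 5}
  return static_cast<int>(v.size());  // 3
}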
+ */ + +#include +#include +#include +#include +#include +#include + +namespace thrust +{ + + +template + ForwardIterator remove(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + const T &value) +{ + using thrust::system::detail::generic::remove; + return remove(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, value); +} // end remove() + + +template + OutputIterator remove_copy(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + const T &value) +{ + using thrust::system::detail::generic::remove_copy; + return remove_copy(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, value); +} // end remove_copy() + + +template + ForwardIterator remove_if(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + Predicate pred) +{ + using thrust::system::detail::generic::remove_if; + return remove_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, pred); +} // end remove_if() + + +template + OutputIterator remove_copy_if(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + Predicate pred) +{ + using thrust::system::detail::generic::remove_copy_if; + return remove_copy_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, pred); +} // end remove_copy_if() + + +template + ForwardIterator remove_if(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + InputIterator stencil, + Predicate pred) +{ + using thrust::system::detail::generic::remove_if; + return remove_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, stencil, pred); +} // end remove_if() + + +template + OutputIterator remove_copy_if(const thrust::detail::execution_policy_base &exec, + InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator result, + Predicate pred) +{ + using thrust::system::detail::generic::remove_copy_if; + return remove_copy_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, stencil, result, pred); +} // end remove_copy_if() + + +template + ForwardIterator remove(ForwardIterator first, + ForwardIterator last, + const T &value) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::remove(select_system(system), first, last, value); +} // end remove() + + +template + OutputIterator remove_copy(InputIterator first, + InputIterator last, + OutputIterator result, + const T &value) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + + System1 system1; + System2 system2; + + return thrust::remove_copy(select_system(system1,system2), first, last, result, value); +} // end remove_copy() + + +template + ForwardIterator remove_if(ForwardIterator first, + ForwardIterator last, + Predicate pred) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::remove_if(select_system(system), first, last, pred); +} // end remove_if() + + +template + ForwardIterator remove_if(ForwardIterator first, + ForwardIterator last, + 
InputIterator stencil, + Predicate pred) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + + System1 system1; + System2 system2; + + return thrust::remove_if(select_system(system1,system2), first, last, stencil, pred); +} // end remove_if() + + +template + OutputIterator remove_copy_if(InputIterator first, + InputIterator last, + OutputIterator result, + Predicate pred) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + + System1 system1; + System2 system2; + + return thrust::remove_copy_if(select_system(system1,system2), first, last, result, pred); +} // end remove_copy_if() + + +template + OutputIterator remove_copy_if(InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator result, + Predicate pred) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + + System1 system1; + System2 system2; + System3 system3; + + return thrust::remove_copy_if(select_system(system1,system2,system3), first, last, stencil, result, pred); +} // end remove_copy_if() + + +} // end namespace thrust + diff --git a/compat/thrust/detail/replace.inl b/compat/thrust/detail/replace.inl new file mode 100644 index 0000000..1eaf24d --- /dev/null +++ b/compat/thrust/detail/replace.inl @@ -0,0 +1,210 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file replace.inl + * \brief Inline file for replace.h. 
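// Illustrative usage sketch (not from the Thrust sources): the stencil overload of
// replace_copy_if defined below. Elements are copied from the input, but wherever the
// predicate holds for the corresponding stencil element the new value is written instead;
// the systems of all three ranges are deduced and combined through select_system.
#include <thrust/host_vector.h>
#include <thrust/replace.h>

struct is_nonzero { bool operator()(int x) const { return x != 0; } };

int example_replace_copy_if()
{
  thrust::host_vector<int> data(4), stencil(4), result(4);
  data[0] = 10; data[1] = 20; data[2] = 30; data[3] = 40;
  stencil[0] = 0; stencil[1] = 1; stencil[2] = 0; stencil[3] = 1;

  // result becomes {10, -1, 30, -1}
  thrust::replace_copy_if(data.begin(), data.end(), stencil.begin(),
                          result.begin(), is_nonzero(), -1);
  return result[1]; // -1
}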
+ */ + +#include +#include +#include +#include +#include +#include + +namespace thrust +{ + + +template + void replace(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, ForwardIterator last, + const T &old_value, + const T &new_value) +{ + using thrust::system::detail::generic::replace; + return replace(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, old_value, new_value); +} // end replace() + + +template + void replace_if(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, ForwardIterator last, + Predicate pred, + const T &new_value) +{ + using thrust::system::detail::generic::replace_if; + return replace_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, pred, new_value); +} // end replace_if() + + +template + void replace_if(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, ForwardIterator last, + InputIterator stencil, + Predicate pred, + const T &new_value) +{ + using thrust::system::detail::generic::replace_if; + return replace_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, stencil, pred, new_value); +} // end replace_if() + + +template + OutputIterator replace_copy(const thrust::detail::execution_policy_base &exec, + InputIterator first, InputIterator last, + OutputIterator result, + const T &old_value, + const T &new_value) +{ + using thrust::system::detail::generic::replace_copy; + return replace_copy(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, old_value, new_value); +} // end replace_copy() + + +template + OutputIterator replace_copy_if(const thrust::detail::execution_policy_base &exec, + InputIterator first, InputIterator last, + OutputIterator result, + Predicate pred, + const T &new_value) +{ + using thrust::system::detail::generic::replace_copy_if; + return replace_copy_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, pred, new_value); +} // end replace_copy_if() + + +template + OutputIterator replace_copy_if(const thrust::detail::execution_policy_base &exec, + InputIterator1 first, InputIterator1 last, + InputIterator2 stencil, + OutputIterator result, + Predicate pred, + const T &new_value) +{ + using thrust::system::detail::generic::replace_copy_if; + return replace_copy_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, stencil, result, pred, new_value); +} // end replace_copy_if() + + +template + OutputIterator replace_copy_if(InputIterator first, InputIterator last, + OutputIterator result, + Predicate pred, + const T &new_value) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + + System1 system1; + System2 system2; + + return thrust::replace_copy_if(select_system(system1,system2), first, last, result, pred, new_value); +} // end replace_copy_if() + + +template + OutputIterator replace_copy_if(InputIterator1 first, InputIterator1 last, + InputIterator2 stencil, + OutputIterator result, + Predicate pred, + const T &new_value) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + + System1 system1; + System2 system2; + System3 system3; + + return 
thrust::replace_copy_if(select_system(system1,system2,system3), first, last, stencil, result, pred, new_value); +} // end replace_copy_if() + + +template + OutputIterator replace_copy(InputIterator first, InputIterator last, + OutputIterator result, + const T &old_value, + const T &new_value) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + + System1 system1; + System2 system2; + + return thrust::replace_copy(select_system(system1,system2), first, last, result, old_value, new_value); +} // end replace_copy() + + +template + void replace_if(ForwardIterator first, ForwardIterator last, + Predicate pred, + const T &new_value) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::replace_if(select_system(system), first, last, pred, new_value); +} // end replace_if() + + +template + void replace_if(ForwardIterator first, ForwardIterator last, + InputIterator stencil, + Predicate pred, + const T &new_value) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + + System1 system1; + System2 system2; + + return thrust::replace_if(select_system(system1,system2), first, last, stencil, pred, new_value); +} // end replace_if() + + +template + void replace(ForwardIterator first, ForwardIterator last, + const T &old_value, + const T &new_value) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::replace(select_system(system), first, last, old_value, new_value); +} // end replace() + + +} // end namespace thrust + diff --git a/compat/thrust/detail/reverse.inl b/compat/thrust/detail/reverse.inl new file mode 100644 index 0000000..18c26c0 --- /dev/null +++ b/compat/thrust/detail/reverse.inl @@ -0,0 +1,87 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file reverse.inl + * \brief Inline file for reverse.h. 
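// Illustrative usage sketch (not from the Thrust sources): reverse operates in place,
// while reverse_copy writes the reversed range to a separate output and leaves its input
// untouched. The dispatch layer below mirrors the other algorithms in this directory.
#include <thrust/host_vector.h>
#include <thrust/reverse.h>

int example_reverse()
{
  thrust::host_vector<int> v(3), r(3);
  v[0] = 1; v[1] = 2; v[2] = 3;

  thrust::reverse_copy(v.begin(), v.end(), r.begin()); // r = {3, 2, 1}, v unchanged
  thrust::reverse(v.begin(), v.end());                 // v = {3, 2, 1}
  return r[0]; // 3
}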
+ */ + +#include +#include +#include +#include +#include +#include + +namespace thrust +{ + + +template + void reverse(const thrust::detail::execution_policy_base &exec, + BidirectionalIterator first, + BidirectionalIterator last) +{ + using thrust::system::detail::generic::reverse; + return reverse(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last); +} // end reverse() + + +template + OutputIterator reverse_copy(const thrust::detail::execution_policy_base &exec, + BidirectionalIterator first, + BidirectionalIterator last, + OutputIterator result) +{ + using thrust::system::detail::generic::reverse_copy; + return reverse_copy(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result); +} // end reverse_copy() + + +template + void reverse(BidirectionalIterator first, + BidirectionalIterator last) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::reverse(select_system(system), first, last); +} // end reverse() + + +template + OutputIterator reverse_copy(BidirectionalIterator first, + BidirectionalIterator last, + OutputIterator result) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + + System1 system1; + System2 system2; + + return thrust::reverse_copy(select_system(system1,system2), first, last, result); +} // end reverse_copy() + + +} // end thrust + diff --git a/compat/thrust/detail/scan.inl b/compat/thrust/detail/scan.inl new file mode 100644 index 0000000..3e5fd9b --- /dev/null +++ b/compat/thrust/detail/scan.inl @@ -0,0 +1,502 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file scan.inl + * \brief Inline file for scan.h. 
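// Illustrative usage sketch (not from the Thrust sources): the difference between the
// inclusive and exclusive scans defined below. For input {1, 2, 3, 4} under operator+,
// inclusive_scan yields {1, 3, 6, 10}, while exclusive_scan with init 0 yields
// {0, 1, 3, 6} -- each output position excludes the element at that position.
#include <thrust/host_vector.h>
#include <thrust/scan.h>

int example_scan()
{
  thrust::host_vector<int> in(4), out(4);
  in[0] = 1; in[1] = 2; in[2] = 3; in[3] = 4;

  thrust::inclusive_scan(in.begin(), in.end(), out.begin());    // out = {1, 3, 6, 10}
  thrust::exclusive_scan(in.begin(), in.end(), out.begin(), 0); // out = {0, 1, 3, 6}
  return out[3]; // 6
}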
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ + + +template + OutputIterator inclusive_scan(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + OutputIterator result) +{ + using thrust::system::detail::generic::inclusive_scan; + return inclusive_scan(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result); +} // end inclusive_scan() + + +template + OutputIterator inclusive_scan(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + AssociativeOperator binary_op) +{ + using thrust::system::detail::generic::inclusive_scan; + return inclusive_scan(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, binary_op); +} // end inclusive_scan() + + +template + OutputIterator exclusive_scan(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + OutputIterator result) +{ + using thrust::system::detail::generic::exclusive_scan; + return exclusive_scan(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result); +} // end exclusive_scan() + + +template + OutputIterator exclusive_scan(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + T init) +{ + using thrust::system::detail::generic::exclusive_scan; + return exclusive_scan(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, init); +} // end exclusive_scan() + + +template + OutputIterator exclusive_scan(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + T init, + AssociativeOperator binary_op) +{ + using thrust::system::detail::generic::exclusive_scan; + return exclusive_scan(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, init, binary_op); +} // end exclusive_scan() + + +template + OutputIterator inclusive_scan_by_key(const thrust::detail::execution_policy_base &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + OutputIterator result) +{ + using thrust::system::detail::generic::inclusive_scan_by_key; + return inclusive_scan_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, result); +} // end inclusive_scan_by_key() + + +template + OutputIterator inclusive_scan_by_key(const thrust::detail::execution_policy_base &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + OutputIterator result, + BinaryPredicate binary_pred) +{ + using thrust::system::detail::generic::inclusive_scan_by_key; + return inclusive_scan_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, result, binary_pred); +} // end inclusive_scan_by_key() + + +template + OutputIterator inclusive_scan_by_key(const thrust::detail::execution_policy_base &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + OutputIterator result, + BinaryPredicate binary_pred, + AssociativeOperator binary_op) +{ + using thrust::system::detail::generic::inclusive_scan_by_key; + return inclusive_scan_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, result, binary_pred, binary_op); +} // end inclusive_scan_by_key() + + +template + OutputIterator exclusive_scan_by_key(const 
thrust::detail::execution_policy_base &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + OutputIterator result) +{ + using thrust::system::detail::generic::exclusive_scan_by_key; + return exclusive_scan_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, result); +} // end exclusive_scan_by_key() + + +template + OutputIterator exclusive_scan_by_key(const thrust::detail::execution_policy_base &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + OutputIterator result, + T init) +{ + using thrust::system::detail::generic::exclusive_scan_by_key; + return exclusive_scan_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, result, init); +} // end exclusive_scan_by_key() + + +template + OutputIterator exclusive_scan_by_key(const thrust::detail::execution_policy_base &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + OutputIterator result, + T init, + BinaryPredicate binary_pred) +{ + using thrust::system::detail::generic::exclusive_scan_by_key; + return exclusive_scan_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, result, init, binary_pred); +} // end exclusive_scan_by_key() + + +template + OutputIterator exclusive_scan_by_key(const thrust::detail::execution_policy_base &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + OutputIterator result, + T init, + BinaryPredicate binary_pred, + AssociativeOperator binary_op) +{ + using thrust::system::detail::generic::exclusive_scan_by_key; + return exclusive_scan_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, result, init, binary_pred, binary_op); +} // end exclusive_scan_by_key() + + +template + OutputIterator inclusive_scan(InputIterator first, + InputIterator last, + OutputIterator result) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + + System1 system1; + System2 system2; + + return thrust::inclusive_scan(select_system(system1,system2), first, last, result); +} // end inclusive_scan() + + +template + OutputIterator inclusive_scan(InputIterator first, + InputIterator last, + OutputIterator result, + BinaryFunction binary_op) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + + System1 system1; + System2 system2; + + return thrust::inclusive_scan(select_system(system1,system2), first, last, result, binary_op); +} // end inclusive_scan() + + +template + OutputIterator exclusive_scan(InputIterator first, + InputIterator last, + OutputIterator result) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + + System1 system1; + System2 system2; + + return thrust::exclusive_scan(select_system(system1,system2), first, last, result); +} // end exclusive_scan() + + +template + OutputIterator exclusive_scan(InputIterator first, + InputIterator last, + OutputIterator result, + T init) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + + System1 system1; + 
System2 system2; + + return thrust::exclusive_scan(select_system(system1,system2), first, last, result, init); +} // end exclusive_scan() + + +template + OutputIterator exclusive_scan(InputIterator first, + InputIterator last, + OutputIterator result, + T init, + BinaryFunction binary_op) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + + System1 system1; + System2 system2; + + return thrust::exclusive_scan(select_system(system1,system2), first, last, result, init, binary_op); +} // end exclusive_scan() + + +template + OutputIterator inclusive_scan_by_key(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + OutputIterator result) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + + System1 system1; + System2 system2; + System3 system3; + + return thrust::inclusive_scan_by_key(select_system(system1,system2,system3), first1, last1, first2, result); +} + + +template + OutputIterator inclusive_scan_by_key(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + OutputIterator result, + BinaryPredicate binary_pred) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + + System1 system1; + System2 system2; + System3 system3; + + return thrust::inclusive_scan_by_key(select_system(system1,system2,system3), first1, last1, first2, result, binary_pred); +} + + +template + OutputIterator inclusive_scan_by_key(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + OutputIterator result, + BinaryPredicate binary_pred, + AssociativeOperator binary_op) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + + System1 system1; + System2 system2; + System3 system3; + + return thrust::inclusive_scan_by_key(select_system(system1,system2,system3), first1, last1, first2, result, binary_pred, binary_op); +} + + +template + OutputIterator exclusive_scan_by_key(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + OutputIterator result) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + + System1 system1; + System2 system2; + System3 system3; + + return thrust::exclusive_scan_by_key(select_system(system1,system2,system3), first1, last1, first2, result); +} + + +template + OutputIterator exclusive_scan_by_key(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + OutputIterator result, + T init) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + + System1 system1; + System2 system2; + System3 system3; + + return thrust::exclusive_scan_by_key(select_system(system1,system2,system3), first1, 
last1, first2, result, init); +} + + +template + OutputIterator exclusive_scan_by_key(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + OutputIterator result, + T init, + BinaryPredicate binary_pred) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + + System1 system1; + System2 system2; + System3 system3; + + return thrust::exclusive_scan_by_key(select_system(system1,system2,system3), first1, last1, first2, result, init, binary_pred); +} + + +template + OutputIterator exclusive_scan_by_key(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + OutputIterator result, + T init, + BinaryPredicate binary_pred, + AssociativeOperator binary_op) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + + System1 system1; + System2 system2; + System3 system3; + + return thrust::exclusive_scan_by_key(select_system(system1,system2,system3), first1, last1, first2, result, init, binary_pred, binary_op); +} + + +} // end namespace thrust + diff --git a/compat/thrust/detail/scatter.inl b/compat/thrust/detail/scatter.inl new file mode 100644 index 0000000..934addb --- /dev/null +++ b/compat/thrust/detail/scatter.inl @@ -0,0 +1,159 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file scatter.inl + * \brief Inline file for scatter.h. 
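// Illustrative usage sketch (not from the Thrust sources): scatter writes input[i] to
// output[map[i]]; scatter_if additionally consults a stencil (optionally through a
// predicate) before writing. The map must not contain duplicate destinations.
#include <thrust/host_vector.h>
#include <thrust/scatter.h>

int example_scatter()
{
  thrust::host_vector<int> input(3), map(3), output(3, 0);
  input[0] = 7; input[1] = 8; input[2] = 9;
  map[0] = 2;   map[1] = 0;   map[2] = 1;

  // output becomes {8, 9, 7}
  thrust::scatter(input.begin(), input.end(), map.begin(), output.begin());
  return output[0]; // 8
}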
+ */ + +#include +#include +#include +#include +#include + +namespace thrust +{ + + +template + void scatter(const thrust::detail::execution_policy_base &exec, + InputIterator1 first, + InputIterator1 last, + InputIterator2 map, + RandomAccessIterator output) +{ + using thrust::system::detail::generic::scatter; + return scatter(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, map, output); +} // end scatter() + + +template + void scatter_if(const thrust::detail::execution_policy_base &exec, + InputIterator1 first, + InputIterator1 last, + InputIterator2 map, + InputIterator3 stencil, + RandomAccessIterator output) +{ + using thrust::system::detail::generic::scatter_if; + return scatter_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, map, stencil, output); +} // end scatter_if() + + +template + void scatter_if(const thrust::detail::execution_policy_base &exec, + InputIterator1 first, + InputIterator1 last, + InputIterator2 map, + InputIterator3 stencil, + RandomAccessIterator output, + Predicate pred) +{ + using thrust::system::detail::generic::scatter_if; + return scatter_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, map, stencil, output, pred); +} // end scatter_if() + + +template + void scatter(InputIterator1 first, + InputIterator1 last, + InputIterator2 map, + RandomAccessIterator output) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + + System1 system1; + System2 system2; + System3 system3; + + return thrust::scatter(select_system(system1,system2,system3), first, last, map, output); +} // end scatter() + + +template + void scatter_if(InputIterator1 first, + InputIterator1 last, + InputIterator2 map, + InputIterator3 stencil, + RandomAccessIterator output) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + typedef typename thrust::iterator_system::type System4; + + System1 system1; + System2 system2; + System3 system3; + System4 system4; + + return thrust::scatter_if(select_system(system1,system2,system3,system4), first, last, map, stencil, output); +} // end scatter_if() + + +template + void scatter_if(InputIterator1 first, + InputIterator1 last, + InputIterator2 map, + InputIterator3 stencil, + RandomAccessIterator output, + Predicate pred) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + typedef typename thrust::iterator_system::type System4; + + System1 system1; + System2 system2; + System3 system3; + System4 system4; + + return thrust::scatter_if(select_system(system1,system2,system3,system4), first, last, map, stencil, output, pred); +} // end scatter_if() + +} // end namespace thrust + diff --git a/compat/thrust/detail/sequence.inl b/compat/thrust/detail/sequence.inl new file mode 100644 index 0000000..f174187 --- /dev/null +++ b/compat/thrust/detail/sequence.inl @@ -0,0 +1,112 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file 
except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file sequence.inl + * \brief Inline file for sequence.h. + */ + +#include +#include +#include +#include +#include +#include + +namespace thrust +{ + + +template + void sequence(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last) +{ + using thrust::system::detail::generic::sequence; + return sequence(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last); +} // end sequence() + + +template + void sequence(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + T init) +{ + using thrust::system::detail::generic::sequence; + return sequence(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, init); +} // end sequence() + + +template + void sequence(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + T init, + T step) +{ + using thrust::system::detail::generic::sequence; + return sequence(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, init, step); +} // end sequence() + + +template + void sequence(ForwardIterator first, + ForwardIterator last) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::sequence(select_system(system), first, last); +} // end sequence() + + +template + void sequence(ForwardIterator first, + ForwardIterator last, + T init) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::sequence(select_system(system), first, last, init); +} // end sequence() + + +template + void sequence(ForwardIterator first, + ForwardIterator last, + T init, + T step) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::sequence(select_system(system), first, last, init, step); +} // end sequence() + + +} // end namespace thrust + diff --git a/compat/thrust/detail/set_operations.inl b/compat/thrust/detail/set_operations.inl new file mode 100644 index 0000000..daec461 --- /dev/null +++ b/compat/thrust/detail/set_operations.inl @@ -0,0 +1,836 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file set_operations.inl + * \brief Inline file for set_operations.h. 
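// Illustrative usage sketch (not from the Thrust sources): the set operations defined
// below require both input ranges to be sorted (by operator< or by the supplied
// comparator), write their result to an output range, and return the end of that output.
#include <thrust/host_vector.h>
#include <thrust/set_operations.h>

int example_set_intersection()
{
  thrust::host_vector<int> a(4), b(3), result(4);
  a[0] = 1; a[1] = 3; a[2] = 5; a[3] = 7;   // sorted
  b[0] = 3; b[1] = 5; b[2] = 9;             // sorted

  thrust::host_vector<int>::iterator end =
      thrust::set_intersection(a.begin(), a.end(), b.begin(), b.end(), result.begin());

  // set_union, set_difference, set_symmetric_difference and the *_by_key variants
  // below follow the same conventions
  return static_cast<int>(end - result.begin()); // 2, with result holding {3, 5}
}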
+ */ + +#include +#include +#include +#include +#include + +namespace thrust +{ + + +template + OutputIterator set_difference(const thrust::detail::execution_policy_base &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result) +{ + using thrust::system::detail::generic::set_difference; + return set_difference(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, last2, result); +} // end set_difference() + + +template + OutputIterator set_difference(const thrust::detail::execution_policy_base &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + StrictWeakCompare comp) +{ + using thrust::system::detail::generic::set_difference; + return set_difference(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, last2, result, comp); +} // end set_difference() + + +template + thrust::pair + set_difference_by_key(const thrust::detail::execution_policy_base &exec, + InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 keys_last2, + InputIterator3 values_first1, + InputIterator4 values_first2, + OutputIterator1 keys_result, + OutputIterator2 values_result) +{ + using thrust::system::detail::generic::set_difference_by_key; + return set_difference_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result); +} // end set_difference_by_key() + + +template + thrust::pair + set_difference_by_key(const thrust::detail::execution_policy_base &exec, + InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 keys_last2, + InputIterator3 values_first1, + InputIterator4 values_first2, + OutputIterator1 keys_result, + OutputIterator2 values_result, + StrictWeakCompare comp) +{ + using thrust::system::detail::generic::set_difference_by_key; + return set_difference_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result, comp); +} // end set_difference_by_key() + + +template + OutputIterator set_intersection(const thrust::detail::execution_policy_base &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result) +{ + using thrust::system::detail::generic::set_intersection; + return set_intersection(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, last2, result); +} // end set_intersection() + + +template + OutputIterator set_intersection(const thrust::detail::execution_policy_base &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + StrictWeakCompare comp) +{ + using thrust::system::detail::generic::set_intersection; + return set_intersection(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, last2, result, comp); +} // end set_intersection() + + +template + thrust::pair + set_intersection_by_key(const thrust::detail::execution_policy_base &exec, + InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 keys_last2, + InputIterator3 values_first1, + OutputIterator1 keys_result, + OutputIterator2 
values_result) +{ + using thrust::system::detail::generic::set_intersection_by_key; + return set_intersection_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, keys_result, values_result); +} // end set_intersection_by_key() + + +template + thrust::pair + set_intersection_by_key(const thrust::detail::execution_policy_base &exec, + InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 keys_last2, + InputIterator3 values_first1, + OutputIterator1 keys_result, + OutputIterator2 values_result, + StrictWeakCompare comp) +{ + using thrust::system::detail::generic::set_intersection_by_key; + return set_intersection_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, keys_result, values_result, comp); +} // end set_intersection_by_key() + + +template + OutputIterator set_symmetric_difference(const thrust::detail::execution_policy_base &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result) +{ + using thrust::system::detail::generic::set_symmetric_difference; + return set_symmetric_difference(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, last2, result); +} // end set_symmetric_difference() + + +template + OutputIterator set_symmetric_difference(const thrust::detail::execution_policy_base &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + StrictWeakCompare comp) +{ + using thrust::system::detail::generic::set_symmetric_difference; + return set_symmetric_difference(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, last2, result, comp); +} // end set_symmetric_difference() + + +template + thrust::pair + set_symmetric_difference_by_key(const thrust::detail::execution_policy_base &exec, + InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 keys_last2, + InputIterator3 values_first1, + InputIterator4 values_first2, + OutputIterator1 keys_result, + OutputIterator2 values_result) +{ + using thrust::system::detail::generic::set_symmetric_difference_by_key; + return set_symmetric_difference_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result); +} // end set_symmetric_difference_by_key() + + +template + thrust::pair + set_symmetric_difference_by_key(const thrust::detail::execution_policy_base &exec, + InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 keys_last2, + InputIterator3 values_first1, + InputIterator4 values_first2, + OutputIterator1 keys_result, + OutputIterator2 values_result, + StrictWeakCompare comp) +{ + using thrust::system::detail::generic::set_symmetric_difference_by_key; + return set_symmetric_difference_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result, comp); +} // end set_symmetric_difference_by_key() + + +template + OutputIterator set_union(const thrust::detail::execution_policy_base &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 
last2, + OutputIterator result) +{ + using thrust::system::detail::generic::set_union; + return set_union(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, last2, result); +} // end set_union() + + +template + OutputIterator set_union(const thrust::detail::execution_policy_base &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + StrictWeakCompare comp) +{ + using thrust::system::detail::generic::set_union; + return set_union(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, last2, result, comp); +} // end set_union() + + +template + thrust::pair + set_union_by_key(const thrust::detail::execution_policy_base &exec, + InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 keys_last2, + InputIterator3 values_first1, + InputIterator4 values_first2, + OutputIterator1 keys_result, + OutputIterator2 values_result) +{ + using thrust::system::detail::generic::set_union_by_key; + return set_union_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result); +} // end set_union_by_key() + + +template + thrust::pair + set_union_by_key(const thrust::detail::execution_policy_base &exec, + InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 keys_last2, + InputIterator3 values_first1, + InputIterator4 values_first2, + OutputIterator1 keys_result, + OutputIterator2 values_result, + StrictWeakCompare comp) +{ + using thrust::system::detail::generic::set_union_by_key; + return set_union_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result, comp); +} // end set_union_by_key() + + +template + OutputIterator set_difference(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + StrictWeakOrdering comp) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + + System1 system1; + System2 system2; + System3 system3; + + return thrust::set_difference(select_system(system1,system2,system3), first1, last1, first2, last2, result, comp); +} // end set_difference() + + +template + OutputIterator set_difference(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + + System1 system1; + System2 system2; + System3 system3; + + return thrust::set_difference(select_system(system1,system2,system3), first1, last1, first2, last2, result); +} // end set_difference() + + +template + thrust::pair + set_difference_by_key(InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 keys_last2, + InputIterator3 values_first1, + InputIterator4 values_first2, + OutputIterator1 keys_result, + OutputIterator2 values_result, + StrictWeakOrdering comp) +{ + using 
thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + typedef typename thrust::iterator_system::type System4; + typedef typename thrust::iterator_system::type System5; + typedef typename thrust::iterator_system::type System6; + + System1 system1; + System2 system2; + System3 system3; + System4 system4; + System5 system5; + System6 system6; + + return thrust::set_difference_by_key(select_system(system1,system2,system3,system4,system5,system6), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result, comp); +} // end set_difference_by_key() + + +template + thrust::pair + set_difference_by_key(InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 keys_last2, + InputIterator3 values_first1, + InputIterator4 values_first2, + OutputIterator1 keys_result, + OutputIterator2 values_result) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + typedef typename thrust::iterator_system::type System4; + typedef typename thrust::iterator_system::type System5; + typedef typename thrust::iterator_system::type System6; + + System1 system1; + System2 system2; + System3 system3; + System4 system4; + System5 system5; + System6 system6; + + return thrust::set_difference_by_key(select_system(system1,system2,system3,system4,system5,system6), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result); +} // end set_difference_by_key() + + +template + OutputIterator set_intersection(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + StrictWeakOrdering comp) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + + System1 system1; + System2 system2; + System3 system3; + + return thrust::set_intersection(select_system(system1,system2,system3), first1, last1, first2, last2, result, comp); +} // end set_intersection() + + +template + OutputIterator set_intersection(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + + System1 system1; + System2 system2; + System3 system3; + + return thrust::set_intersection(select_system(system1,system2,system3), first1, last1, first2, last2, result); +} // end set_intersection() + + +template + thrust::pair + set_intersection_by_key(InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 keys_last2, + InputIterator3 values_first1, + OutputIterator1 keys_result, + OutputIterator2 values_result, + StrictWeakOrdering comp) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type 
System2; + typedef typename thrust::iterator_system::type System3; + typedef typename thrust::iterator_system::type System4; + typedef typename thrust::iterator_system::type System5; + + System1 system1; + System2 system2; + System3 system3; + System4 system4; + System5 system5; + + return thrust::set_intersection_by_key(select_system(system1,system2,system3,system4,system5), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, keys_result, values_result, comp); +} // end set_intersection_by_key() + + +template + thrust::pair + set_intersection_by_key(InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 keys_last2, + InputIterator3 values_first1, + OutputIterator1 keys_result, + OutputIterator2 values_result) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + typedef typename thrust::iterator_system::type System4; + typedef typename thrust::iterator_system::type System5; + + System1 system1; + System2 system2; + System3 system3; + System4 system4; + System5 system5; + + return thrust::set_intersection_by_key(select_system(system1,system2,system3,system4,system5), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, keys_result, values_result); +} // end set_intersection_by_key() + + +template + OutputIterator set_symmetric_difference(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + StrictWeakOrdering comp) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + + System1 system1; + System2 system2; + System3 system3; + + return thrust::set_symmetric_difference(select_system(system1,system2,system3), first1, last1, first2, last2, result, comp); +} // end set_symmetric_difference() + + +template + OutputIterator set_symmetric_difference(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + + System1 system1; + System2 system2; + System3 system3; + + return thrust::set_symmetric_difference(select_system(system1,system2,system3), first1, last1, first2, last2, result); +} // end set_symmetric_difference() + + +template + thrust::pair + set_symmetric_difference_by_key(InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 keys_last2, + InputIterator3 values_first1, + InputIterator4 values_first2, + OutputIterator1 keys_result, + OutputIterator2 values_result, + StrictWeakOrdering comp) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + typedef typename thrust::iterator_system::type System4; + typedef typename thrust::iterator_system::type System5; + typedef typename thrust::iterator_system::type System6; + + System1 system1; + System2 system2; + System3 
system3; + System4 system4; + System5 system5; + System6 system6; + + return thrust::set_symmetric_difference_by_key(select_system(system1,system2,system3,system4,system5,system6), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result, comp); +} // end set_symmetric_difference_by_key() + + +template + thrust::pair + set_symmetric_difference_by_key(InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 keys_last2, + InputIterator3 values_first1, + InputIterator4 values_first2, + OutputIterator1 keys_result, + OutputIterator2 values_result) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + typedef typename thrust::iterator_system::type System4; + typedef typename thrust::iterator_system::type System5; + typedef typename thrust::iterator_system::type System6; + + System1 system1; + System2 system2; + System3 system3; + System4 system4; + System5 system5; + System6 system6; + + return thrust::set_symmetric_difference_by_key(select_system(system1,system2,system3,system4,system5,system6), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result); +} // end set_symmetric_difference_by_key() + + +template + OutputIterator set_union(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + StrictWeakOrdering comp) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + + System1 system1; + System2 system2; + System3 system3; + + return thrust::set_union(select_system(system1,system2,system3), first1, last1, first2, last2, result, comp); +} // end set_union() + + +template + OutputIterator set_union(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + + System1 system1; + System2 system2; + System3 system3; + + return thrust::set_union(select_system(system1,system2,system3), first1, last1, first2, last2, result); +} // end set_union() + + +template + thrust::pair + set_union_by_key(InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 keys_last2, + InputIterator3 values_first1, + InputIterator4 values_first2, + OutputIterator1 keys_result, + OutputIterator2 values_result, + StrictWeakOrdering comp) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + typedef typename thrust::iterator_system::type System4; + typedef typename thrust::iterator_system::type System5; + typedef typename thrust::iterator_system::type System6; + + System1 system1; + System2 system2; + System3 system3; + System4 system4; + System5 system5; + System6 system6; + + return 
thrust::set_union_by_key(select_system(system1,system2,system3,system4,system5,system6), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result, comp); +} // end set_union_by_key() + + +template + thrust::pair + set_union_by_key(InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 keys_last2, + InputIterator3 values_first1, + InputIterator4 values_first2, + OutputIterator1 keys_result, + OutputIterator2 values_result) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + typedef typename thrust::iterator_system::type System4; + typedef typename thrust::iterator_system::type System5; + typedef typename thrust::iterator_system::type System6; + + System1 system1; + System2 system2; + System3 system3; + System4 system4; + System5 system5; + System6 system6; + + return thrust::set_union_by_key(select_system(system1,system2,system3,system4,system5,system6), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result); +} // end set_union_by_key() + + +} // end thrust + diff --git a/compat/thrust/detail/sort.inl b/compat/thrust/detail/sort.inl new file mode 100644 index 0000000..08be55a --- /dev/null +++ b/compat/thrust/detail/sort.inl @@ -0,0 +1,383 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file sort.inl + * \brief Inline file for sort.h. 
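+ *
+ * A minimal usage sketch, assuming a hypothetical std::vector<int> v on the
+ * host: the unqualified calls below pick their backend from the iterators'
+ * system tag via select_system before forwarding to the generic
+ * implementations defined in this file.
+ *
+ *   thrust::sort(v.begin(), v.end());
+ *   thrust::sort(v.begin(), v.end(), thrust::greater<int>());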
+ */ + +#include +#include +#include +#include +#include +#include + +namespace thrust +{ + + +template + void sort(const thrust::detail::execution_policy_base &exec, + RandomAccessIterator first, + RandomAccessIterator last) +{ + using thrust::system::detail::generic::sort; + return sort(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last); +} // end sort() + + +template + void sort(const thrust::detail::execution_policy_base &exec, + RandomAccessIterator first, + RandomAccessIterator last, + StrictWeakOrdering comp) +{ + using thrust::system::detail::generic::sort; + return sort(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, comp); +} // end sort() + + +template + void stable_sort(const thrust::detail::execution_policy_base &exec, + RandomAccessIterator first, + RandomAccessIterator last) +{ + using thrust::system::detail::generic::stable_sort; + return stable_sort(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last); +} // end stable_sort() + + +template + void stable_sort(const thrust::detail::execution_policy_base &exec, + RandomAccessIterator first, + RandomAccessIterator last, + StrictWeakOrdering comp) +{ + using thrust::system::detail::generic::stable_sort; + return stable_sort(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, comp); +} // end stable_sort() + + +template + void sort_by_key(const thrust::detail::execution_policy_base &exec, + RandomAccessIterator1 keys_first, + RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first) +{ + using thrust::system::detail::generic::sort_by_key; + return sort_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first, keys_last, values_first); +} // end sort_by_key() + + +template + void sort_by_key(const thrust::detail::execution_policy_base &exec, + RandomAccessIterator1 keys_first, + RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first, + StrictWeakOrdering comp) +{ + using thrust::system::detail::generic::sort_by_key; + return sort_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first, keys_last, values_first, comp); +} // end sort_by_key() + + +template + void stable_sort_by_key(const thrust::detail::execution_policy_base &exec, + RandomAccessIterator1 keys_first, + RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first) +{ + using thrust::system::detail::generic::stable_sort_by_key; + return stable_sort_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first, keys_last, values_first); +} // end stable_sort_by_key() + + +template + void stable_sort_by_key(const thrust::detail::execution_policy_base &exec, + RandomAccessIterator1 keys_first, + RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first, + StrictWeakOrdering comp) +{ + using thrust::system::detail::generic::stable_sort_by_key; + return stable_sort_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first, keys_last, values_first, comp); +} // end stable_sort_by_key() + + +template + bool is_sorted(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last) +{ + using thrust::system::detail::generic::is_sorted; + return is_sorted(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last); +} // end is_sorted() + + +template + bool is_sorted(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + 
Compare comp) +{ + using thrust::system::detail::generic::is_sorted; + return is_sorted(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, comp); +} // end is_sorted() + + +template + ForwardIterator is_sorted_until(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last) +{ + using thrust::system::detail::generic::is_sorted_until; + return is_sorted_until(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last); +} // end is_sorted_until() + + +template + ForwardIterator is_sorted_until(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + Compare comp) +{ + using thrust::system::detail::generic::is_sorted_until; + return is_sorted_until(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, comp); +} // end is_sorted_until() + + +/////////////// +// Key Sorts // +/////////////// + +template + void sort(RandomAccessIterator first, + RandomAccessIterator last) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::sort(select_system(system), first, last); +} // end sort() + + +template + void sort(RandomAccessIterator first, + RandomAccessIterator last, + StrictWeakOrdering comp) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::sort(select_system(system), first, last, comp); +} // end sort() + + +template + void stable_sort(RandomAccessIterator first, + RandomAccessIterator last) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::stable_sort(select_system(system), first, last); +} // end stable_sort() + + +template + void stable_sort(RandomAccessIterator first, + RandomAccessIterator last, + StrictWeakOrdering comp) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::stable_sort(select_system(system), first, last, comp); +} // end stable_sort() + + + +///////////////////// +// Key-Value Sorts // +///////////////////// + +template + void sort_by_key(RandomAccessIterator1 keys_first, + RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + + System1 system1; + System2 system2; + + return thrust::sort_by_key(select_system(system1,system2), keys_first, keys_last, values_first); +} // end sort_by_key() + + +template + void sort_by_key(RandomAccessIterator1 keys_first, + RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first, + StrictWeakOrdering comp) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + + System1 system1; + System2 system2; + + return thrust::sort_by_key(select_system(system1,system2), keys_first, keys_last, values_first, comp); +} // end sort_by_key() + + +template + void stable_sort_by_key(RandomAccessIterator1 keys_first, + RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first) +{ + using thrust::system::detail::generic::select_system; + + typedef typename 
thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + + System1 system1; + System2 system2; + + return thrust::stable_sort_by_key(select_system(system1,system2), keys_first, keys_last, values_first); +} // end stable_sort_by_key() + + +template + void stable_sort_by_key(RandomAccessIterator1 keys_first, + RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first, + StrictWeakOrdering comp) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + + System1 system1; + System2 system2; + + return thrust::stable_sort_by_key(select_system(system1,system2), keys_first, keys_last, values_first, comp); +} // end stable_sort_by_key() + + +template + bool is_sorted(ForwardIterator first, + ForwardIterator last) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::is_sorted(select_system(system), first, last); +} // end is_sorted() + + +template + bool is_sorted(ForwardIterator first, + ForwardIterator last, + Compare comp) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::is_sorted(select_system(system), first, last, comp); +} // end is_sorted() + + +template + ForwardIterator is_sorted_until(ForwardIterator first, + ForwardIterator last) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::is_sorted_until(select_system(system), first, last); +} // end is_sorted_until() + + +template + ForwardIterator is_sorted_until(ForwardIterator first, + ForwardIterator last, + Compare comp) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::is_sorted_until(select_system(system), first, last, comp); +} // end is_sorted_until() + + +} // end namespace thrust + diff --git a/compat/thrust/detail/static_assert.h b/compat/thrust/detail/static_assert.h new file mode 100644 index 0000000..ccc0842 --- /dev/null +++ b/compat/thrust/detail/static_assert.h @@ -0,0 +1,71 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +/* + * (C) Copyright John Maddock 2000. + * + * Distributed under the Boost Software License, Version 1.0. + * (See accompanying NOTICE file for the complete license) + * + * For more information, see http://www.boost.org + */ + +// +// Helper macro THRUST_JOIN (based on BOOST_JOIN): +// The following piece of macro magic joins the two +// arguments together, even when one of the arguments is +// itself a macro (see 16.3.1 in C++ standard). 
The key +// is that macro expansion of macro arguments does not +// occur in THRUST_DO_JOIN2 but does in THRUST_DO_JOIN. +// +#define THRUST_JOIN( X, Y ) THRUST_DO_JOIN( X, Y ) +#define THRUST_DO_JOIN( X, Y ) THRUST_DO_JOIN2(X,Y) +#define THRUST_DO_JOIN2( X, Y ) X##Y + +namespace thrust +{ + +namespace detail +{ + +// HP aCC cannot deal with missing names for template value parameters +template struct STATIC_ASSERTION_FAILURE; + +template <> struct STATIC_ASSERTION_FAILURE { enum { value = 1 }; }; + +// HP aCC cannot deal with missing names for template value parameters +template struct static_assert_test{}; + +template + struct depend_on_instantiation +{ + static const bool value = x; +}; + +} // end detail + +} // end thrust + +#define THRUST_STATIC_ASSERT( B ) \ + typedef ::thrust::detail::static_assert_test<\ + sizeof(::thrust::detail::STATIC_ASSERTION_FAILURE< (bool)( B ) >)>\ + THRUST_JOIN(thrust_static_assert_typedef_, __LINE__) + diff --git a/compat/thrust/detail/swap.h b/compat/thrust/detail/swap.h new file mode 100644 index 0000000..9f82ac2 --- /dev/null +++ b/compat/thrust/detail/swap.h @@ -0,0 +1,35 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#pragma once + +#include + +namespace thrust +{ + +template +__host__ __device__ +inline void swap(Assignable1 &a, Assignable2 &b) +{ + Assignable1 temp = a; + a = b; + b = temp; +} // end swap() + +} // end namespace thrust + diff --git a/compat/thrust/detail/swap.inl b/compat/thrust/detail/swap.inl new file mode 100644 index 0000000..eafd70a --- /dev/null +++ b/compat/thrust/detail/swap.inl @@ -0,0 +1,21 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#include +#include +#include + diff --git a/compat/thrust/detail/swap_ranges.inl b/compat/thrust/detail/swap_ranges.inl new file mode 100644 index 0000000..e3b06de --- /dev/null +++ b/compat/thrust/detail/swap_ranges.inl @@ -0,0 +1,64 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file swap_ranges.inl + * \brief Inline file for swap_ranges.h. + */ + +#include +#include +#include +#include +#include + +namespace thrust +{ + + +template + ForwardIterator2 swap_ranges(const thrust::detail::execution_policy_base &exec, + ForwardIterator1 first1, + ForwardIterator1 last1, + ForwardIterator2 first2) +{ + using thrust::system::detail::generic::swap_ranges; + return swap_ranges(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2); +} // end swap_ranges() + + +template + ForwardIterator2 swap_ranges(ForwardIterator1 first1, + ForwardIterator1 last1, + ForwardIterator2 first2) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + + System1 system1; + System2 system2; + + return thrust::swap_ranges(select_system(system1,system2), first1, last1, first2); +} // end swap_ranges() + + +} // end namespace thrust + diff --git a/compat/thrust/detail/tabulate.inl b/compat/thrust/detail/tabulate.inl new file mode 100644 index 0000000..961c76e --- /dev/null +++ b/compat/thrust/detail/tabulate.inl @@ -0,0 +1,55 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +namespace thrust +{ + + +template + void tabulate(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + UnaryOperation unary_op) +{ + using thrust::system::detail::generic::tabulate; + return tabulate(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, unary_op); +} // end tabulate() + + +template + void tabulate(ForwardIterator first, + ForwardIterator last, + UnaryOperation unary_op) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::tabulate(select_system(system), first, last, unary_op); +} // end tabulate() + + +} // end namespace thrust + diff --git a/compat/thrust/detail/temporary_array.h b/compat/thrust/detail/temporary_array.h new file mode 100644 index 0000000..3a9e084 --- /dev/null +++ b/compat/thrust/detail/temporary_array.h @@ -0,0 +1,158 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/*! \file temporary_array.h + * \brief Container-like class temporary storage inside algorithms. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace detail +{ + + +template + class temporary_array + : public contiguous_storage< + T, + no_throw_allocator< + temporary_allocator + > + > +{ + private: + typedef contiguous_storage< + T, + no_throw_allocator< + temporary_allocator + > + > super_t; + + // to help out the constructor + typedef no_throw_allocator > alloc_type; + + public: + typedef typename super_t::size_type size_type; + + temporary_array(thrust::execution_policy &system, size_type n); + + // provide a kill-switch to explicitly avoid initialization + temporary_array(int uninit, thrust::execution_policy &system, size_type n); + + template + temporary_array(thrust::execution_policy &system, + InputIterator first, + size_type n); + + template + temporary_array(thrust::execution_policy &system, + thrust::execution_policy &input_system, + InputIterator first, + size_type n); + + template + temporary_array(thrust::execution_policy &system, + InputIterator first, + InputIterator last); + + template + temporary_array(thrust::execution_policy &system, + thrust::execution_policy &input_system, + InputIterator first, + InputIterator last); + + ~temporary_array(); +}; // end temporary_array + + +// XXX eliminate this when we do ranges for real +template + class tagged_iterator_range +{ + public: + typedef thrust::detail::tagged_iterator iterator; + + template + tagged_iterator_range(const Ignored1 &, const Ignored2 &, Iterator first, Iterator last) + : m_begin(reinterpret_tag(first)), + m_end(reinterpret_tag(last)) + {} + + iterator begin(void) const { return m_begin; } + iterator end(void) const { return m_end; } + + private: + iterator m_begin, m_end; +}; + + +// if FromSystem is convertible to ToSystem, then just make a shallow +// copy of the range. else, use a temporary_array +// note that the resulting iterator is explicitly tagged with ToSystem either way +template + struct move_to_system_base + : public eval_if< + is_convertible< + FromSystem, + ToSystem + >::value, + identity_< + tagged_iterator_range + >, + identity_< + temporary_array< + typename thrust::iterator_value::type, + ToSystem + > + > + > +{}; + + +template + class move_to_system + : public move_to_system_base< + Iterator, + FromSystem, + ToSystem + >::type +{ + typedef typename move_to_system_base::type super_t; + + public: + move_to_system(thrust::execution_policy &from_system, + thrust::execution_policy &to_system, + Iterator first, + Iterator last) + : super_t(to_system, from_system, first, last) {} +}; + + +} // end detail +} // end thrust + +#include + diff --git a/compat/thrust/detail/temporary_array.inl b/compat/thrust/detail/temporary_array.inl new file mode 100644 index 0000000..36ed167 --- /dev/null +++ b/compat/thrust/detail/temporary_array.inl @@ -0,0 +1,148 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + + +namespace thrust +{ + +namespace detail +{ +namespace temporary_array_detail +{ + + +template struct avoid_initialization : thrust::detail::has_trivial_copy_constructor {}; + + +template +typename thrust::detail::enable_if< + avoid_initialization::value +>::type + construct_values(TemporaryArray &, + Size) +{ + // avoid the overhead of initialization +} // end construct_values() + + +template +typename thrust::detail::disable_if< + avoid_initialization::value +>::type + construct_values(TemporaryArray &a, + Size n) +{ + a.default_construct_n(a.begin(), n); +} // end construct_values() + + +} // end temporary_array_detail + + +template + temporary_array + ::temporary_array(thrust::execution_policy &system, size_type n) + :super_t(n, alloc_type(temporary_allocator(system))) +{ + temporary_array_detail::construct_values(*this, n); +} // end temporary_array::temporary_array() + + +template + temporary_array + ::temporary_array(int, thrust::execution_policy &system, size_type n) + :super_t(n, alloc_type(temporary_allocator(system))) +{ + // avoid initialization + ; +} // end temporary_array::temporary_array() + + +template + template + temporary_array + ::temporary_array(thrust::execution_policy &system, + InputIterator first, + size_type n) + : super_t(alloc_type(temporary_allocator(system))) +{ + super_t::allocate(n); + + super_t::uninitialized_copy_n(system, first, n, super_t::begin()); +} // end temporary_array::temporary_array() + + +template + template + temporary_array + ::temporary_array(thrust::execution_policy &system, + thrust::execution_policy &input_system, + InputIterator first, + size_type n) + : super_t(alloc_type(temporary_allocator(system))) +{ + super_t::allocate(n); + + super_t::uninitialized_copy_n(input_system, first, n, super_t::begin()); +} // end temporary_array::temporary_array() + + +template + template + temporary_array + ::temporary_array(thrust::execution_policy &system, + InputIterator first, + InputIterator last) + : super_t(alloc_type(temporary_allocator(system))) +{ + super_t::allocate(thrust::distance(first,last)); + + super_t::uninitialized_copy(system, first, last, super_t::begin()); +} // end temporary_array::temporary_array() + + +template + template + temporary_array + ::temporary_array(thrust::execution_policy &system, + thrust::execution_policy &input_system, + InputIterator first, + InputIterator last) + : super_t(alloc_type(temporary_allocator(system))) +{ + super_t::allocate(thrust::distance(first,last)); + + super_t::uninitialized_copy(input_system, first, last, super_t::begin()); +} // end temporary_array::temporary_array() + + +template + temporary_array + ::~temporary_array() +{ + // note that super_t::destroy will ignore trivial destructors automatically + super_t::destroy(super_t::begin(), super_t::end()); +} // end temporary_array::~temporary_array() + +} // end detail + +} // end thrust + diff --git a/compat/thrust/detail/temporary_buffer.h b/compat/thrust/detail/temporary_buffer.h new file mode 100644 index 0000000..046a3b3 --- /dev/null +++ b/compat/thrust/detail/temporary_buffer.h @@ -0,0 +1,71 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace detail +{ +namespace get_temporary_buffer_detail +{ + + +template + thrust::pair, typename thrust::pointer::difference_type> + down_cast_pair(Pair p) +{ + // XXX should use a hypothetical thrust::static_pointer_cast here + thrust::pointer ptr = thrust::pointer(static_cast(thrust::raw_pointer_cast(p.first))); + + typedef thrust::pair, typename thrust::pointer::difference_type> result_type; + return result_type(ptr, p.second); +} // end down_cast_pair() + + +} // end get_temporary_buffer_detail +} // end detail + + +template + thrust::pair, typename thrust::pointer::difference_type> + get_temporary_buffer(const thrust::detail::execution_policy_base &exec, typename thrust::pointer::difference_type n) +{ + using thrust::system::detail::generic::get_temporary_buffer; + + return thrust::detail::get_temporary_buffer_detail::down_cast_pair(get_temporary_buffer(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), n)); +} // end get_temporary_buffer() + + +template + void return_temporary_buffer(const thrust::detail::execution_policy_base &exec, Pointer p) +{ + using thrust::system::detail::generic::return_temporary_buffer; + + return return_temporary_buffer(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), p); +} // end return_temporary_buffer() + + +} // end thrust + diff --git a/compat/thrust/detail/transform.inl b/compat/thrust/detail/transform.inl new file mode 100644 index 0000000..ae303bc --- /dev/null +++ b/compat/thrust/detail/transform.inl @@ -0,0 +1,239 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file transform.inl + * \brief Inline file for transform.h. 
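+ *
+ * A minimal usage sketch, assuming hypothetical host vectors a, b and c of
+ * equal size: each call deduces its backend from the iterators' system tags
+ * via select_system and then forwards to the generic implementation.
+ *
+ *   thrust::transform(a.begin(), a.end(), b.begin(),
+ *                     thrust::negate<int>());                  // unary
+ *   thrust::transform(a.begin(), a.end(), b.begin(), c.begin(),
+ *                     thrust::plus<int>());                    // binary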
+ */ + +#include +#include +#include +#include +#include + +namespace thrust +{ + + +template + OutputIterator transform(const thrust::detail::execution_policy_base &exec, + InputIterator first, InputIterator last, + OutputIterator result, + UnaryFunction op) +{ + using thrust::system::detail::generic::transform; + return transform(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, op); +} // end transform() + + +template + OutputIterator transform(const thrust::detail::execution_policy_base &exec, + InputIterator1 first1, InputIterator1 last1, + InputIterator2 first2, + OutputIterator result, + BinaryFunction op) +{ + using thrust::system::detail::generic::transform; + return transform(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, result, op); +} // end transform() + + +template + ForwardIterator transform_if(const thrust::detail::execution_policy_base &exec, + InputIterator first, InputIterator last, + ForwardIterator result, + UnaryFunction op, + Predicate pred) +{ + using thrust::system::detail::generic::transform_if; + return transform_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, op, pred); +} // end transform_if() + + +template + ForwardIterator transform_if(const thrust::detail::execution_policy_base &exec, + InputIterator1 first, InputIterator1 last, + InputIterator2 stencil, + ForwardIterator result, + UnaryFunction op, + Predicate pred) +{ + using thrust::system::detail::generic::transform_if; + return transform_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, stencil, result, op, pred); +} // end transform_if() + + +template + ForwardIterator transform_if(const thrust::detail::execution_policy_base &exec, + InputIterator1 first1, InputIterator1 last1, + InputIterator2 first2, + InputIterator3 stencil, + ForwardIterator result, + BinaryFunction binary_op, + Predicate pred) +{ + using thrust::system::detail::generic::transform_if; + return transform_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, stencil, result, binary_op, pred); +} // end transform_if() + + +template + OutputIterator transform(InputIterator first, + InputIterator last, + OutputIterator result, + UnaryFunction op) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + + System1 system1; + System2 system2; + + return thrust::transform(select_system(system1,system2), first, last, result, op); +} // end transform() + + +template + OutputIterator transform(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + OutputIterator result, + BinaryFunction op) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + + System1 system1; + System2 system2; + System3 system3; + + return thrust::transform(select_system(system1,system2,system3), first1, last1, first2, result, op); +} // end transform() + + +template + ForwardIterator transform_if(InputIterator first, + InputIterator last, + ForwardIterator result, + UnaryFunction unary_op, + Predicate pred) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename 
thrust::iterator_system::type System2; + + System1 system1; + System2 system2; + + return thrust::transform_if(select_system(system1,system2), first, last, result, unary_op, pred); +} // end transform_if() + + +template + ForwardIterator transform_if(InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + ForwardIterator result, + UnaryFunction unary_op, + Predicate pred) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + + System1 system1; + System2 system2; + System3 system3; + + return thrust::transform_if(select_system(system1,system2,system3), first, last, stencil, result, unary_op, pred); +} // end transform_if() + + +template + ForwardIterator transform_if(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator3 stencil, + ForwardIterator result, + BinaryFunction binary_op, + Predicate pred) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + typedef typename thrust::iterator_system::type System4; + + System1 system1; + System2 system2; + System3 system3; + System4 system4; + + return thrust::transform_if(select_system(system1,system2,system3,system4), first1, last1, first2, stencil, result, binary_op, pred); +} // end transform_if() + + +} // end namespace thrust + diff --git a/compat/thrust/detail/transform_reduce.inl b/compat/thrust/detail/transform_reduce.inl new file mode 100644 index 0000000..ede6503 --- /dev/null +++ b/compat/thrust/detail/transform_reduce.inl @@ -0,0 +1,70 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file transform_reduce.inl + * \brief Inline file for transform_reduce.h. 
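+ *
+ * A minimal usage sketch, assuming a hypothetical host vector v of ints and a
+ * user-defined functor square whose __host__ __device__ operator() returns
+ * x*x: the transform and the reduction are fused into a single pass.
+ *
+ *   int sum_of_squares =
+ *     thrust::transform_reduce(v.begin(), v.end(), square(), 0,
+ *                              thrust::plus<int>());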
+ */ + +#include +#include +#include +#include +#include + +namespace thrust +{ + + +template + OutputType transform_reduce(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + UnaryFunction unary_op, + OutputType init, + BinaryFunction binary_op) +{ + using thrust::system::detail::generic::transform_reduce; + return transform_reduce(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, unary_op, init, binary_op); +} // end transform_reduce() + + +template + OutputType transform_reduce(InputIterator first, + InputIterator last, + UnaryFunction unary_op, + OutputType init, + BinaryFunction binary_op) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::transform_reduce(select_system(system), first, last, unary_op, init, binary_op); +} // end transform_reduce() + + +} // end namespace thrust + diff --git a/compat/thrust/detail/transform_scan.inl b/compat/thrust/detail/transform_scan.inl new file mode 100644 index 0000000..0187c4b --- /dev/null +++ b/compat/thrust/detail/transform_scan.inl @@ -0,0 +1,115 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file transform_scan.inl + * \brief Inline file for transform_scan.h. 
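+ *
+ * A minimal usage sketch, assuming a hypothetical host vector v and an output
+ * vector r of the same size: the unary op is applied on the fly, so no
+ * intermediate sequence of transformed values is materialized.
+ *
+ *   thrust::transform_inclusive_scan(v.begin(), v.end(), r.begin(),
+ *                                    thrust::negate<int>(), thrust::plus<int>());
+ *   thrust::transform_exclusive_scan(v.begin(), v.end(), r.begin(),
+ *                                    thrust::negate<int>(), 0, thrust::plus<int>());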
+ */ + +#include +#include +#include +#include +#include + +namespace thrust +{ + + +template + OutputIterator transform_inclusive_scan(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + UnaryFunction unary_op, + AssociativeOperator binary_op) +{ + using thrust::system::detail::generic::transform_inclusive_scan; + return transform_inclusive_scan(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, unary_op, binary_op); +} // end transform_inclusive_scan() + + +template + OutputIterator transform_exclusive_scan(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + UnaryFunction unary_op, + T init, + AssociativeOperator binary_op) +{ + using thrust::system::detail::generic::transform_exclusive_scan; + return transform_exclusive_scan(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, unary_op, init, binary_op); +} // end transform_exclusive_scan() + + +template + OutputIterator transform_inclusive_scan(InputIterator first, + InputIterator last, + OutputIterator result, + UnaryFunction unary_op, + BinaryFunction binary_op) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + + System1 system1; + System2 system2; + + return thrust::transform_inclusive_scan(select_system(system1,system2), first, last, result, unary_op, binary_op); +} // end transform_inclusive_scan() + + +template + OutputIterator transform_exclusive_scan(InputIterator first, + InputIterator last, + OutputIterator result, + UnaryFunction unary_op, + T init, + AssociativeOperator binary_op) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + + System1 system1; + System2 system2; + + return thrust::transform_exclusive_scan(select_system(system1,system2), first, last, result, unary_op, init, binary_op); +} // end transform_exclusive_scan() + + +} // end namespace thrust + diff --git a/compat/thrust/detail/trivial_sequence.h b/compat/thrust/detail/trivial_sequence.h new file mode 100644 index 0000000..cc7e32b --- /dev/null +++ b/compat/thrust/detail/trivial_sequence.h @@ -0,0 +1,87 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file trivial_sequence.h + * \brief Container-like class for wrapping sequences. The wrapped + * sequence always has trivial iterators, even when the input + * sequence does not. 
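+ *
+ * A minimal sketch of the intended use, assuming an iterator range
+ * [first, last) that may not be trivial (e.g. a zip_iterator range) and an
+ * execution policy exec; the two template arguments are taken here to be the
+ * iterator type and the derived policy type:
+ *
+ *   thrust::detail::trivial_sequence<Iterator, DerivedPolicy> ts(exec, first, last);
+ *   // ts.begin()/ts.end() are trivial either way: the original iterators
+ *   // when they already were, otherwise iterators into a temporary_array copy.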
+ */ + + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ + +namespace detail +{ + +// never instantiated +template struct _trivial_sequence { }; + +// trivial case +template +struct _trivial_sequence +{ + typedef Iterator iterator_type; + Iterator first, last; + + _trivial_sequence(thrust::execution_policy &, Iterator _first, Iterator _last) : first(_first), last(_last) + { +// std::cout << "trivial case" << std::endl; + } + + iterator_type begin() { return first; } + iterator_type end() { return last; } +}; + +// non-trivial case +template +struct _trivial_sequence +{ + typedef typename thrust::iterator_value::type iterator_value; + typedef typename thrust::detail::temporary_array::iterator iterator_type; + + thrust::detail::temporary_array buffer; + + _trivial_sequence(thrust::execution_policy &exec, Iterator first, Iterator last) + : buffer(exec, first, last) + { +// std::cout << "non-trivial case" << std::endl; + } + + iterator_type begin() { return buffer.begin(); } + iterator_type end() { return buffer.end(); } +}; + +template +struct trivial_sequence + : detail::_trivial_sequence::type> +{ + typedef _trivial_sequence::type> super_t; + + trivial_sequence(thrust::execution_policy &exec, Iterator first, Iterator last) : super_t(exec, first, last) { } +}; + +} // end namespace detail + +} // end namespace thrust + diff --git a/compat/thrust/detail/tuple.inl b/compat/thrust/detail/tuple.inl new file mode 100644 index 0000000..067ad63 --- /dev/null +++ b/compat/thrust/detail/tuple.inl @@ -0,0 +1,948 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +namespace thrust +{ + +// define null_type +struct null_type {}; + +// null_type comparisons +__host__ __device__ inline +bool operator==(const null_type&, const null_type&) { return true; } + +__host__ __device__ inline +bool operator>=(const null_type&, const null_type&) { return true; } + +__host__ __device__ inline +bool operator<=(const null_type&, const null_type&) { return true; } + +__host__ __device__ inline +bool operator!=(const null_type&, const null_type&) { return false; } + +__host__ __device__ inline +bool operator<(const null_type&, const null_type&) { return false; } + +__host__ __device__ inline +bool operator>(const null_type&, const null_type&) { return false; } + +// forward declaration for tuple +template < + class T0 = null_type, class T1 = null_type, class T2 = null_type, + class T3 = null_type, class T4 = null_type, class T5 = null_type, + class T6 = null_type, class T7 = null_type, class T8 = null_type, + class T9 = null_type> +class tuple; + +// forward declaration of tuple_element +template struct tuple_element; + +// specializations for tuple_element +template + struct tuple_element<0,T> +{ + typedef typename T::head_type type; +}; // end tuple_element<0,T> + +template + struct tuple_element +{ + private: + typedef typename T::tail_type Next; + typedef typename tuple_element::type unqualified_type; + + public: + typedef typename thrust::detail::add_const::type type; +}; // end tuple_element + +template + struct tuple_element<0,const T> +{ + typedef typename thrust::detail::add_const::type type; +}; // end tuple_element<0,const T> + + + +// forward declaration of tuple_size +template struct tuple_size; + +// specializations for tuple_size +template<> + struct tuple_size< tuple<> > +{ + static const int value = 0; +}; // end tuple_size< tuple<> > + +template<> + struct tuple_size +{ + static const int value = 0; +}; // end tuple_size + + + +// forward declaration of detail::cons +namespace detail +{ + +template struct cons; + +} // end detail + + +// -- some traits classes for get functions +template struct access_traits +{ + typedef const T& const_type; + typedef T& non_const_type; + + typedef const typename thrust::detail::remove_cv::type& parameter_type; + +// used as the tuple constructors parameter types +// Rationale: non-reference tuple element types can be cv-qualified. +// It should be possible to initialize such types with temporaries, +// and when binding temporaries to references, the reference must +// be non-volatile and const. 8.5.3. 
(5) +}; // end access_traits + +template struct access_traits +{ + typedef T& const_type; + typedef T& non_const_type; + + typedef T& parameter_type; +}; // end access_traits + +// forward declarations of get() +template +__host__ __device__ +inline typename access_traits< + typename tuple_element >::type + >::non_const_type +// XXX we probably don't need to do this for any compiler we care about -jph +//get(cons& c BOOST_APPEND_EXPLICIT_TEMPLATE_NON_TYPE(int, N)); +get(detail::cons& c); + +template +__host__ __device__ +inline typename access_traits< + typename tuple_element >::type + >::const_type +// XXX we probably don't need to do this for any compiler we care about -jph +//get(const cons& c BOOST_APPEND_EXPLICIT_TEMPLATE_NON_TYPE(int, N)); +get(const detail::cons& c); + +namespace detail +{ + +// -- generate error template, referencing to non-existing members of this +// template is used to produce compilation errors intentionally +template +class generate_error; + +// - cons getters -------------------------------------------------------- +// called: get_class::get(aTuple) + +template< int N > +struct get_class +{ + template + __host__ __device__ + inline static RET get(const cons& t) + { + // XXX we may not need to deal with this for any compiler we care about -jph + //return get_class::BOOST_NESTED_TEMPLATE get(t.tail); + return get_class::template get(t.tail); + + // gcc 4.3 couldn't compile this: + //return get_class::get(t.tail); + } + + template + __host__ __device__ + inline static RET get(cons& t) + { + // XXX we may not need to deal with this for any compiler we care about -jph + //return get_class::BOOST_NESTED_TEMPLATE get(t.tail); + return get_class::template get(t.tail); + + // gcc 4.3 couldn't compile this: + //return get_class::get(t.tail); + } +}; // end get_class + +template<> +struct get_class<0> +{ + template + __host__ __device__ + inline static RET get(const cons& t) + { + return t.head; + } + + template + __host__ __device__ + inline static RET get(cons& t) + { + return t.head; + } +}; // get get_class<0> + + +template struct IF +{ + typedef Then RET; +}; + +template struct IF +{ + typedef Else RET; +}; + +// These helper templates wrap void types and plain function types. +// The rationale is to allow one to write tuple types with those types +// as elements, even though it is not possible to instantiate such object. 
+// E.g: typedef tuple some_type; // ok +// but: some_type x; // fails + +template class non_storeable_type +{ + __host__ __device__ + non_storeable_type(); +}; + +template struct wrap_non_storeable_type +{ + // XXX is_function looks complicated; punt for now -jph + //typedef typename IF< + // ::thrust::detail::is_function::value, non_storeable_type, T + //>::RET type; + + typedef T type; +}; + +template <> struct wrap_non_storeable_type +{ + typedef non_storeable_type type; +}; + + +template + struct cons +{ + typedef HT head_type; + typedef TT tail_type; + + typedef typename + wrap_non_storeable_type::type stored_head_type; + + stored_head_type head; + tail_type tail; + + inline __host__ __device__ + typename access_traits::non_const_type + get_head() { return head; } + + inline __host__ __device__ + typename access_traits::non_const_type + get_tail() { return tail; } + + inline __host__ __device__ + typename access_traits::const_type + get_head() const { return head; } + + inline __host__ __device__ + typename access_traits::const_type + get_tail() const { return tail; } + + inline __host__ __device__ + cons(void) : head(), tail() {} + // cons() : head(detail::default_arg::f()), tail() {} + + // the argument for head is not strictly needed, but it prevents + // array type elements. This is good, since array type elements + // cannot be supported properly in any case (no assignment, + // copy works only if the tails are exactly the same type, ...) + + inline __host__ __device__ + cons(typename access_traits::parameter_type h, + const tail_type& t) + : head (h), tail(t) {} + + template + inline __host__ __device__ + cons( T1& t1, T2& t2, T3& t3, T4& t4, T5& t5, + T6& t6, T7& t7, T8& t8, T9& t9, T10& t10 ) + : head (t1), + tail (t2, t3, t4, t5, t6, t7, t8, t9, t10, static_cast(null_type())) + {} + + template + inline __host__ __device__ + cons( const null_type& /*t1*/, T2& t2, T3& t3, T4& t4, T5& t5, + T6& t6, T7& t7, T8& t8, T9& t9, T10& t10 ) + : head (), + tail (t2, t3, t4, t5, t6, t7, t8, t9, t10, static_cast(null_type())) + {} + + + template + inline __host__ __device__ + cons( const cons& u ) : head(u.head), tail(u.tail) {} + + template + inline __host__ __device__ + cons& operator=( const cons& u ) { + head=u.head; tail=u.tail; return *this; + } + + // must define assignment operator explicitly, implicit version is + // illformed if HT is a reference (12.8. 
(12)) + inline __host__ __device__ + cons& operator=(const cons& u) { + head = u.head; tail = u.tail; return *this; + } + + // XXX enable when we support std::pair -jph + //template + //__host__ __device__ + //cons& operator=( const std::pair& u ) { + // //BOOST_STATIC_ASSERT(length::value == 2); // check length = 2 + // head = u.first; tail.head = u.second; return *this; + //} + + // get member functions (non-const and const) + template + __host__ __device__ + typename access_traits< + typename tuple_element >::type + >::non_const_type + get() { + return thrust::get(*this); // delegate to non-member get + } + + template + __host__ __device__ + typename access_traits< + typename tuple_element >::type + >::const_type + get() const { + return thrust::get(*this); // delegate to non-member get + } + + inline __host__ __device__ + void swap(cons &c) + { + using thrust::swap; + + swap(head, c.head); + tail.swap(c.tail); + } +}; + +template + struct cons +{ + typedef HT head_type; + typedef null_type tail_type; + typedef cons self_type; + + typedef typename + wrap_non_storeable_type::type stored_head_type; + stored_head_type head; + + typename access_traits::non_const_type + inline __host__ __device__ + get_head() { return head; } + + inline __host__ __device__ + null_type get_tail() { return null_type(); } + + inline __host__ __device__ + typename access_traits::const_type + get_head() const { return head; } + + inline __host__ __device__ + null_type get_tail() const { return null_type(); } + + inline __host__ __device__ + cons() : head() {} + + inline __host__ __device__ + cons(typename access_traits::parameter_type h, + const null_type& = null_type()) + : head (h) {} + + template + inline __host__ __device__ + cons(T1& t1, const null_type&, const null_type&, const null_type&, + const null_type&, const null_type&, const null_type&, + const null_type&, const null_type&, const null_type&) + : head (t1) {} + + inline __host__ __device__ + cons(const null_type&, + const null_type&, const null_type&, const null_type&, + const null_type&, const null_type&, const null_type&, + const null_type&, const null_type&, const null_type&) + : head () {} + + template + inline __host__ __device__ + cons( const cons& u ) : head(u.head) {} + + template + inline __host__ __device__ + cons& operator=(const cons& u ) + { + head = u.head; + return *this; + } + + // must define assignment operator explicitly, implicit version + // is illformed if HT is a reference + inline __host__ __device__ + cons& operator=(const cons& u) { head = u.head; return *this; } + + template + inline __host__ __device__ + typename access_traits< + typename tuple_element::type + >::non_const_type + // XXX we probably don't need this for the compilers we care about -jph + //get(BOOST_EXPLICIT_TEMPLATE_NON_TYPE(int, N)) + get(void) + { + return thrust::get(*this); + } + + template + inline __host__ __device__ + typename access_traits< + typename tuple_element::type + >::const_type + // XXX we probably don't need this for the compilers we care about -jph + //get(BOOST_EXPLICIT_TEMPLATE_NON_TYPE(int, N)) const + get(void) const + { + return thrust::get(*this); + } + + inline __host__ __device__ + void swap(cons &c) + { + using thrust::swap; + + swap(head, c.head); + } +}; // end cons + +template + struct map_tuple_to_cons +{ + typedef cons::type + > type; +}; // end map_tuple_to_cons + +// The empty tuple is a null_type +template <> + struct map_tuple_to_cons +{ + typedef null_type type; +}; // end map_tuple_to_cons<...> + + + +// 
--------------------------------------------------------------------------- +// The call_traits for make_tuple + +// Must be instantiated with plain or const plain types (not with references) + +// from template foo(const T& t) : make_tuple_traits::type +// from template foo(T& t) : make_tuple_traits::type + +// Conversions: +// T -> T, +// references -> compile_time_error +// array -> const ref array + + +template +struct make_tuple_traits { + typedef T type; + + // commented away, see below (JJ) + // typedef typename IF< + // boost::is_function::value, + // T&, + // T>::RET type; + +}; + +// The is_function test was there originally for plain function types, +// which can't be stored as such (we must either store them as references or +// pointers). Such a type could be formed if make_tuple was called with a +// reference to a function. +// But this would mean that a const qualified function type was formed in +// the make_tuple function and hence make_tuple can't take a function +// reference as a parameter, and thus T can't be a function type. +// So is_function test was removed. +// (14.8.3. says that type deduction fails if a cv-qualified function type +// is created. (It only applies for the case of explicitly specifying template +// args, though?)) (JJ) + +template +struct make_tuple_traits { + typedef typename + detail::generate_error:: + do_not_use_with_reference_type error; +}; + +// Arrays can't be stored as plain types; convert them to references. +// All arrays are converted to const. This is because make_tuple takes its +// parameters as const T& and thus the knowledge of the potential +// non-constness of actual argument is lost. +template struct make_tuple_traits { + typedef const T (&type)[n]; +}; + +template +struct make_tuple_traits { + typedef const T (&type)[n]; +}; + +template struct make_tuple_traits { + typedef const volatile T (&type)[n]; +}; + +template +struct make_tuple_traits { + typedef const volatile T (&type)[n]; +}; + +// XXX enable these if we ever care about reference_wrapper -jph +//template +//struct make_tuple_traits >{ +// typedef T& type; +//}; +// +//template +//struct make_tuple_traits >{ +// typedef T& type; +//}; + + +// a helper traits to make the make_tuple functions shorter (Vesa Karvonen's +// suggestion) +template < + class T0 = null_type, class T1 = null_type, class T2 = null_type, + class T3 = null_type, class T4 = null_type, class T5 = null_type, + class T6 = null_type, class T7 = null_type, class T8 = null_type, + class T9 = null_type +> +struct make_tuple_mapper { + typedef + tuple::type, + typename make_tuple_traits::type, + typename make_tuple_traits::type, + typename make_tuple_traits::type, + typename make_tuple_traits::type, + typename make_tuple_traits::type, + typename make_tuple_traits::type, + typename make_tuple_traits::type, + typename make_tuple_traits::type, + typename make_tuple_traits::type> type; +}; + +} // end detail + + +template +__host__ __device__ +inline typename access_traits< + typename tuple_element >::type + >::non_const_type +get(detail::cons& c) +{ + //return detail::get_class::BOOST_NESTED_TEMPLATE + + // gcc 4.3 couldn't compile this: + //return detail::get_class:: + + return detail::get_class::template + get< + typename access_traits< + typename tuple_element >::type + >::non_const_type, + HT,TT + >(c); +} + + +// get function for const cons-lists, returns a const reference to +// the element. 
If the element is a reference, returns the reference +// as such (that is, can return a non-const reference) +template +__host__ __device__ +inline typename access_traits< + typename tuple_element >::type + >::const_type +get(const detail::cons& c) +{ + //return detail::get_class::BOOST_NESTED_TEMPLATE + + // gcc 4.3 couldn't compile this: + //return detail::get_class:: + + return detail::get_class::template + get< + typename access_traits< + typename tuple_element >::type + >::const_type, + HT,TT + >(c); +} + + +template +__host__ __device__ inline + typename detail::make_tuple_mapper::type + make_tuple(const T0& t0) +{ + typedef typename detail::make_tuple_mapper::type t; + return t(t0); +} // end make_tuple() + +template +__host__ __device__ inline + typename detail::make_tuple_mapper::type + make_tuple(const T0& t0, const T1& t1) +{ + typedef typename detail::make_tuple_mapper::type t; + return t(t0,t1); +} // end make_tuple() + +template +__host__ __device__ inline + typename detail::make_tuple_mapper::type + make_tuple(const T0& t0, const T1& t1, const T2& t2) +{ + typedef typename detail::make_tuple_mapper::type t; + return t(t0,t1,t2); +} // end make_tuple() + +template +__host__ __device__ inline + typename detail::make_tuple_mapper::type + make_tuple(const T0& t0, const T1& t1, const T2& t2, const T3& t3) +{ + typedef typename detail::make_tuple_mapper::type t; + return t(t0,t1,t2,t3); +} // end make_tuple() + +template +__host__ __device__ inline + typename detail::make_tuple_mapper::type + make_tuple(const T0& t0, const T1& t1, const T2& t2, const T3& t3, const T4& t4) +{ + typedef typename detail::make_tuple_mapper::type t; + return t(t0,t1,t2,t3,t4); +} // end make_tuple() + +template +__host__ __device__ inline + typename detail::make_tuple_mapper::type + make_tuple(const T0& t0, const T1& t1, const T2& t2, const T3& t3, const T4& t4, const T5& t5) +{ + typedef typename detail::make_tuple_mapper::type t; + return t(t0,t1,t2,t3,t4,t5); +} // end make_tuple() + +template +__host__ __device__ inline + typename detail::make_tuple_mapper::type + make_tuple(const T0& t0, const T1& t1, const T2& t2, const T3& t3, const T4& t4, const T5& t5, const T6& t6) +{ + typedef typename detail::make_tuple_mapper::type t; + return t(t0,t1,t2,t3,t4,t5,t6); +} // end make_tuple() + +template +__host__ __device__ inline + typename detail::make_tuple_mapper::type + make_tuple(const T0& t0, const T1& t1, const T2& t2, const T3& t3, const T4& t4, const T5& t5, const T6& t6, const T7& t7) +{ + typedef typename detail::make_tuple_mapper::type t; + return t(t0,t1,t2,t3,t4,t5,t6,t7); +} // end make_tuple() + +template +__host__ __device__ inline + typename detail::make_tuple_mapper::type + make_tuple(const T0& t0, const T1& t1, const T2& t2, const T3& t3, const T4& t4, const T5& t5, const T6& t6, const T7& t7, const T8& t8) +{ + typedef typename detail::make_tuple_mapper::type t; + return t(t0,t1,t2,t3,t4,t5,t6,t7,t8); +} // end make_tuple() + +template +__host__ __device__ inline + typename detail::make_tuple_mapper::type + make_tuple(const T0& t0, const T1& t1, const T2& t2, const T3& t3, const T4& t4, const T5& t5, const T6& t6, const T7& t7, const T8& t8, const T9& t9) +{ + typedef typename detail::make_tuple_mapper::type t; + return t(t0,t1,t2,t3,t4,t5,t6,t7,t8,t9); +} // end make_tuple() + + +template +__host__ __device__ inline +tuple tie(T0 &t0) +{ + return tuple(t0); +} + +template +__host__ __device__ inline +tuple tie(T0 &t0, T1 &t1) +{ + return tuple(t0,t1); +} + +template +__host__ 
__device__ inline +tuple tie(T0 &t0, T1 &t1, T2 &t2) +{ + return tuple(t0,t1,t2); +} + +template +__host__ __device__ inline +tuple tie(T0 &t0, T1 &t1, T2 &t2, T3 &t3) +{ + return tuple(t0,t1,t2,t3); +} + +template +__host__ __device__ inline +tuple tie(T0 &t0, T1 &t1, T2 &t2, T3 &t3, T4 &t4) +{ + return tuple(t0,t1,t2,t3,t4); +} + +template +__host__ __device__ inline +tuple tie(T0 &t0, T1 &t1, T2 &t2, T3 &t3, T4 &t4, T5 &t5) +{ + return tuple(t0,t1,t2,t3,t4,t5); +} + +template +__host__ __device__ inline +tuple tie(T0 &t0, T1 &t1, T2 &t2, T3 &t3, T4 &t4, T5 &t5, T6 &t6) +{ + return tuple(t0,t1,t2,t3,t4,t5,t6); +} + +template +__host__ __device__ inline +tuple tie(T0 &t0, T1 &t1, T2 &t2, T3 &t3, T4 &t4, T5 &t5, T6 &t6, T7 &t7) +{ + return tuple(t0,t1,t2,t3,t4,t5,t6,t7); +} + +template +__host__ __device__ inline +tuple tie(T0 &t0, T1 &t1, T2 &t2, T3 &t3, T4 &t4, T5 &t5, T6 &t6, T7 &t7, T8 &t8) +{ + return tuple(t0,t1,t2,t3,t4,t5,t6,t7,t8); +} + +template +__host__ __device__ inline +tuple tie(T0 &t0, T1 &t1, T2 &t2, T3 &t3, T4 &t4, T5 &t5, T6 &t6, T7 &t7, T8 &t8, T9 &t9) +{ + return tuple(t0,t1,t2,t3,t4,t5,t6,t7,t8,t9); +} + +template< + typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8, typename T9, + typename U0, typename U1, typename U2, typename U3, typename U4, typename U5, typename U6, typename U7, typename U8, typename U9 +> +__host__ __device__ inline +void swap(thrust::tuple &x, + thrust::tuple &y) +{ + return x.swap(y); +} + + + +namespace detail +{ + +template +__host__ __device__ +inline bool eq(const T1& lhs, const T2& rhs) { + return lhs.get_head() == rhs.get_head() && + eq(lhs.get_tail(), rhs.get_tail()); +} +template<> +inline bool eq(const null_type&, const null_type&) { return true; } + +template +__host__ __device__ +inline bool neq(const T1& lhs, const T2& rhs) { + return lhs.get_head() != rhs.get_head() || + neq(lhs.get_tail(), rhs.get_tail()); +} +template<> +__host__ __device__ +inline bool neq(const null_type&, const null_type&) { return false; } + +template +__host__ __device__ +inline bool lt(const T1& lhs, const T2& rhs) { + return (lhs.get_head() < rhs.get_head()) || + (!(rhs.get_head() < lhs.get_head()) && + lt(lhs.get_tail(), rhs.get_tail())); +} +template<> +__host__ __device__ +inline bool lt(const null_type&, const null_type&) { return false; } + +template +__host__ __device__ +inline bool gt(const T1& lhs, const T2& rhs) { + return (lhs.get_head() > rhs.get_head()) || + (!(rhs.get_head() > lhs.get_head()) && + gt(lhs.get_tail(), rhs.get_tail())); +} +template<> +__host__ __device__ +inline bool gt(const null_type&, const null_type&) { return false; } + +template +__host__ __device__ +inline bool lte(const T1& lhs, const T2& rhs) { + return lhs.get_head() <= rhs.get_head() && + ( !(rhs.get_head() <= lhs.get_head()) || + lte(lhs.get_tail(), rhs.get_tail())); +} +template<> +__host__ __device__ +inline bool lte(const null_type&, const null_type&) { return true; } + +template +__host__ __device__ +inline bool gte(const T1& lhs, const T2& rhs) { + return lhs.get_head() >= rhs.get_head() && + ( !(rhs.get_head() >= lhs.get_head()) || + gte(lhs.get_tail(), rhs.get_tail())); +} +template<> +__host__ __device__ +inline bool gte(const null_type&, const null_type&) { return true; } + +} // end detail + + + +// equal ---- + +template +__host__ __device__ +inline bool operator==(const detail::cons& lhs, const detail::cons& rhs) +{ + // XXX support this eventually -jph + //// check that tuple lengths are 
equal + //BOOST_STATIC_ASSERT(tuple_size::value == tuple_size::value); + + return detail::eq(lhs, rhs); +} // end operator==() + +// not equal ----- + +template +__host__ __device__ +inline bool operator!=(const detail::cons& lhs, const detail::cons& rhs) +{ + // XXX support this eventually -jph + //// check that tuple lengths are equal + //BOOST_STATIC_ASSERT(tuple_size::value == tuple_size::value); + + return detail::neq(lhs, rhs); +} // end operator!=() + +// < +template +__host__ __device__ +inline bool operator<(const detail::cons& lhs, const detail::cons& rhs) +{ + // XXX support this eventually -jph + //// check that tuple lengths are equal + //BOOST_STATIC_ASSERT(tuple_size::value == tuple_size::value); + + return detail::lt(lhs, rhs); +} // end operator<() + +// > +template +__host__ __device__ +inline bool operator>(const detail::cons& lhs, const detail::cons& rhs) +{ + // XXX support this eventually -jph + //// check that tuple lengths are equal + //BOOST_STATIC_ASSERT(tuple_size::value == tuple_size::value); + + return detail::gt(lhs, rhs); +} // end operator>() + +// <= +template +__host__ __device__ +inline bool operator<=(const detail::cons& lhs, const detail::cons& rhs) +{ + // XXX support this eventually -jph + //// check that tuple lengths are equal + //BOOST_STATIC_ASSERT(tuple_size::value == tuple_size::value); + + return detail::lte(lhs, rhs); +} // end operator<=() + +// >= +template +__host__ __device__ +inline bool operator>=(const detail::cons& lhs, const detail::cons& rhs) +{ + // XXX support this eventually -jph + //// check that tuple lengths are equal + //BOOST_STATIC_ASSERT(tuple_size::value == tuple_size::value); + + return detail::gte(lhs, rhs); +} // end operator>=() + +} // end thrust + diff --git a/compat/thrust/detail/tuple_meta_transform.h b/compat/thrust/detail/tuple_meta_transform.h new file mode 100644 index 0000000..ff99709 --- /dev/null +++ b/compat/thrust/detail/tuple_meta_transform.h @@ -0,0 +1,177 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +namespace thrust +{ + +namespace detail +{ + +template class UnaryMetaFunction, + unsigned int sz = thrust::tuple_size::value> + struct tuple_meta_transform; + +template class UnaryMetaFunction> + struct tuple_meta_transform +{ + typedef null_type type; +}; + +template class UnaryMetaFunction> + struct tuple_meta_transform +{ + typedef thrust::tuple< + typename UnaryMetaFunction::type>::type + > type; +}; + +template class UnaryMetaFunction> + struct tuple_meta_transform +{ + typedef thrust::tuple< + typename UnaryMetaFunction::type>::type, + typename UnaryMetaFunction::type>::type + > type; +}; + +template class UnaryMetaFunction> + struct tuple_meta_transform +{ + typedef thrust::tuple< + typename UnaryMetaFunction::type>::type, + typename UnaryMetaFunction::type>::type, + typename UnaryMetaFunction::type>::type + > type; +}; + +template class UnaryMetaFunction> + struct tuple_meta_transform +{ + typedef thrust::tuple< + typename UnaryMetaFunction::type>::type, + typename UnaryMetaFunction::type>::type, + typename UnaryMetaFunction::type>::type, + typename UnaryMetaFunction::type>::type + > type; +}; + +template class UnaryMetaFunction> + struct tuple_meta_transform +{ + typedef thrust::tuple< + typename UnaryMetaFunction::type>::type, + typename UnaryMetaFunction::type>::type, + typename UnaryMetaFunction::type>::type, + typename UnaryMetaFunction::type>::type, + typename UnaryMetaFunction::type>::type + > type; +}; + +template class UnaryMetaFunction> + struct tuple_meta_transform +{ + typedef thrust::tuple< + typename UnaryMetaFunction::type>::type, + typename UnaryMetaFunction::type>::type, + typename UnaryMetaFunction::type>::type, + typename UnaryMetaFunction::type>::type, + typename UnaryMetaFunction::type>::type, + typename UnaryMetaFunction::type>::type + > type; +}; + +template class UnaryMetaFunction> + struct tuple_meta_transform +{ + typedef thrust::tuple< + typename UnaryMetaFunction::type>::type, + typename UnaryMetaFunction::type>::type, + typename UnaryMetaFunction::type>::type, + typename UnaryMetaFunction::type>::type, + typename UnaryMetaFunction::type>::type, + typename UnaryMetaFunction::type>::type, + typename UnaryMetaFunction::type>::type + > type; +}; + +template class UnaryMetaFunction> + struct tuple_meta_transform +{ + typedef thrust::tuple< + typename UnaryMetaFunction::type>::type, + typename UnaryMetaFunction::type>::type, + typename UnaryMetaFunction::type>::type, + typename UnaryMetaFunction::type>::type, + typename UnaryMetaFunction::type>::type, + typename UnaryMetaFunction::type>::type, + typename UnaryMetaFunction::type>::type, + typename UnaryMetaFunction::type>::type + > type; +}; + +template class UnaryMetaFunction> + struct tuple_meta_transform +{ + typedef thrust::tuple< + typename UnaryMetaFunction::type>::type, + typename UnaryMetaFunction::type>::type, + typename UnaryMetaFunction::type>::type, + typename UnaryMetaFunction::type>::type, + typename UnaryMetaFunction::type>::type, + typename UnaryMetaFunction::type>::type, + typename UnaryMetaFunction::type>::type, + typename UnaryMetaFunction::type>::type, + typename UnaryMetaFunction::type>::type + > type; +}; + +template class UnaryMetaFunction> + struct tuple_meta_transform +{ + typedef thrust::tuple< + typename UnaryMetaFunction::type>::type, + typename UnaryMetaFunction::type>::type, + typename UnaryMetaFunction::type>::type, + typename UnaryMetaFunction::type>::type, + typename UnaryMetaFunction::type>::type, + typename UnaryMetaFunction::type>::type, 
+ typename UnaryMetaFunction::type>::type, + typename UnaryMetaFunction::type>::type, + typename UnaryMetaFunction::type>::type, + typename UnaryMetaFunction::type>::type + > type; +}; + +} // end detail + +} // end thrust + diff --git a/compat/thrust/detail/tuple_transform.h b/compat/thrust/detail/tuple_transform.h new file mode 100644 index 0000000..f18b872 --- /dev/null +++ b/compat/thrust/detail/tuple_transform.h @@ -0,0 +1,418 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace thrust +{ + +namespace detail +{ + +template class UnaryMetaFunction, + typename UnaryFunction, + unsigned int sz = thrust::tuple_size::value> + struct tuple_transform_functor; + + +template class UnaryMetaFunction, + typename UnaryFunction> + struct tuple_transform_functor +{ + static __host__ + typename tuple_meta_transform::type + do_it_on_the_host(const Tuple &t, UnaryFunction f) + { + return thrust::null_type(); + } + + static __host__ __device__ + typename tuple_meta_transform::type + do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f) + { + return thrust::null_type(); + } +}; + + +template class UnaryMetaFunction, + typename UnaryFunction> + struct tuple_transform_functor +{ + static __host__ + typename tuple_meta_transform::type + do_it_on_the_host(const Tuple &t, UnaryFunction f) + { + typedef typename tuple_meta_transform::type XfrmTuple; + + return XfrmTuple(f(thrust::get<0>(t))); + } + + static __host__ __device__ + typename tuple_meta_transform::type + do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f) + { + typedef typename tuple_meta_transform::type XfrmTuple; + + return XfrmTuple(f(thrust::get<0>(t))); + } +}; + + +template class UnaryMetaFunction, + typename UnaryFunction> + struct tuple_transform_functor +{ + static __host__ + typename tuple_meta_transform::type + do_it_on_the_host(const Tuple &t, UnaryFunction f) + { + typedef typename tuple_meta_transform::type XfrmTuple; + + return XfrmTuple(f(thrust::get<0>(t)), + f(thrust::get<1>(t))); + } + + static __host__ __device__ + typename tuple_meta_transform::type + do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f) + { + typedef typename tuple_meta_transform::type XfrmTuple; + + return XfrmTuple(f(thrust::get<0>(t)), + f(thrust::get<1>(t))); + } +}; + + +template class UnaryMetaFunction, + typename UnaryFunction> + struct tuple_transform_functor +{ + static __host__ + typename tuple_meta_transform::type + do_it_on_the_host(const Tuple &t, UnaryFunction f) + { + typedef typename tuple_meta_transform::type XfrmTuple; + + return XfrmTuple(f(thrust::get<0>(t)), + f(thrust::get<1>(t)), + f(thrust::get<2>(t))); + } + + static __host__ __device__ + typename tuple_meta_transform::type + do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f) + { + typedef typename tuple_meta_transform::type XfrmTuple; + + return XfrmTuple(f(thrust::get<0>(t)), + f(thrust::get<1>(t)), + f(thrust::get<2>(t))); + } +}; + + 
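
Each tuple_transform_functor specialization above unrolls the same pattern for one tuple size: compute the result tuple type with tuple_meta_transform, then construct it by applying f to every element obtained through thrust::get<i>. A hedged sketch of that pattern for a fixed two-element tuple follows; the metafunction and function names here are placeholders for illustration and are not part of this patch, and the snippet assumes compilation with nvcc because of the __host__ __device__ annotations.

#include <thrust/tuple.h>

// Placeholder metafunction standing in for UnaryMetaFunction: it maps every
// element type to itself, so the transformed tuple keeps the same types.
template <typename T> struct keep_type { typedef T type; };

// Apply f to both elements of a 2-tuple, mirroring the sz == 2
// do_it_on_the_host_or_device specialization shown above.
template <typename T0, typename T1, typename UnaryFunction>
__host__ __device__
thrust::tuple<typename keep_type<T0>::type,
              typename keep_type<T1>::type>
transform_pair_sketch(const thrust::tuple<T0, T1> &t, UnaryFunction f)
{
  typedef thrust::tuple<typename keep_type<T0>::type,
                        typename keep_type<T1>::type> XfrmTuple;
  return XfrmTuple(f(thrust::get<0>(t)),
                   f(thrust::get<1>(t)));
}

With, say, thrust::negate<int>() and thrust::make_tuple(1, 2), this returns the tuple (-1, -2); the real code generalizes the same construction to sizes 0 through 10 and to an arbitrary element-type metafunction.
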
+template class UnaryMetaFunction, + typename UnaryFunction> + struct tuple_transform_functor +{ + static __host__ + typename tuple_meta_transform::type + do_it_on_the_host(const Tuple &t, UnaryFunction f) + { + typedef typename tuple_meta_transform::type XfrmTuple; + + return XfrmTuple(f(thrust::get<0>(t)), + f(thrust::get<1>(t)), + f(thrust::get<2>(t)), + f(thrust::get<3>(t))); + } + + static __host__ __device__ + typename tuple_meta_transform::type + do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f) + { + typedef typename tuple_meta_transform::type XfrmTuple; + + return XfrmTuple(f(thrust::get<0>(t)), + f(thrust::get<1>(t)), + f(thrust::get<2>(t)), + f(thrust::get<3>(t))); + } +}; + + +template class UnaryMetaFunction, + typename UnaryFunction> + struct tuple_transform_functor +{ + static __host__ + typename tuple_meta_transform::type + do_it_on_the_host(const Tuple &t, UnaryFunction f) + { + typedef typename tuple_meta_transform::type XfrmTuple; + + return XfrmTuple(f(thrust::get<0>(t)), + f(thrust::get<1>(t)), + f(thrust::get<2>(t)), + f(thrust::get<3>(t)), + f(thrust::get<4>(t))); + } + + static __host__ __device__ + typename tuple_meta_transform::type + do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f) + { + typedef typename tuple_meta_transform::type XfrmTuple; + + return XfrmTuple(f(thrust::get<0>(t)), + f(thrust::get<1>(t)), + f(thrust::get<2>(t)), + f(thrust::get<3>(t)), + f(thrust::get<4>(t))); + } +}; + + +template class UnaryMetaFunction, + typename UnaryFunction> + struct tuple_transform_functor +{ + static __host__ + typename tuple_meta_transform::type + do_it_on_the_host(const Tuple &t, UnaryFunction f) + { + typedef typename tuple_meta_transform::type XfrmTuple; + + return XfrmTuple(f(thrust::get<0>(t)), + f(thrust::get<1>(t)), + f(thrust::get<2>(t)), + f(thrust::get<3>(t)), + f(thrust::get<4>(t)), + f(thrust::get<5>(t))); + } + + static __host__ __device__ + typename tuple_meta_transform::type + do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f) + { + typedef typename tuple_meta_transform::type XfrmTuple; + + return XfrmTuple(f(thrust::get<0>(t)), + f(thrust::get<1>(t)), + f(thrust::get<2>(t)), + f(thrust::get<3>(t)), + f(thrust::get<4>(t)), + f(thrust::get<5>(t))); + } +}; + + +template class UnaryMetaFunction, + typename UnaryFunction> + struct tuple_transform_functor +{ + static __host__ + typename tuple_meta_transform::type + do_it_on_the_host(const Tuple &t, UnaryFunction f) + { + typedef typename tuple_meta_transform::type XfrmTuple; + + return XfrmTuple(f(thrust::get<0>(t)), + f(thrust::get<1>(t)), + f(thrust::get<2>(t)), + f(thrust::get<3>(t)), + f(thrust::get<4>(t)), + f(thrust::get<5>(t)), + f(thrust::get<6>(t))); + } + + static __host__ __device__ + typename tuple_meta_transform::type + do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f) + { + typedef typename tuple_meta_transform::type XfrmTuple; + + return XfrmTuple(f(thrust::get<0>(t)), + f(thrust::get<1>(t)), + f(thrust::get<2>(t)), + f(thrust::get<3>(t)), + f(thrust::get<4>(t)), + f(thrust::get<5>(t)), + f(thrust::get<6>(t))); + } +}; + + +template class UnaryMetaFunction, + typename UnaryFunction> + struct tuple_transform_functor +{ + static __host__ + typename tuple_meta_transform::type + do_it_on_the_host(const Tuple &t, UnaryFunction f) + { + typedef typename tuple_meta_transform::type XfrmTuple; + + return XfrmTuple(f(thrust::get<0>(t)), + f(thrust::get<1>(t)), + f(thrust::get<2>(t)), + f(thrust::get<3>(t)), + f(thrust::get<4>(t)), + f(thrust::get<5>(t)), + 
f(thrust::get<6>(t)), + f(thrust::get<7>(t))); + } + + static __host__ __device__ + typename tuple_meta_transform::type + do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f) + { + typedef typename tuple_meta_transform::type XfrmTuple; + + return XfrmTuple(f(thrust::get<0>(t)), + f(thrust::get<1>(t)), + f(thrust::get<2>(t)), + f(thrust::get<3>(t)), + f(thrust::get<4>(t)), + f(thrust::get<5>(t)), + f(thrust::get<6>(t)), + f(thrust::get<7>(t))); + } +}; + + +template class UnaryMetaFunction, + typename UnaryFunction> + struct tuple_transform_functor +{ + static __host__ + typename tuple_meta_transform::type + do_it_on_the_host(const Tuple &t, UnaryFunction f) + { + typedef typename tuple_meta_transform::type XfrmTuple; + + return XfrmTuple(f(thrust::get<0>(t)), + f(thrust::get<1>(t)), + f(thrust::get<2>(t)), + f(thrust::get<3>(t)), + f(thrust::get<4>(t)), + f(thrust::get<5>(t)), + f(thrust::get<6>(t)), + f(thrust::get<7>(t)), + f(thrust::get<8>(t))); + } + + static __host__ __device__ + typename tuple_meta_transform::type + do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f) + { + typedef typename tuple_meta_transform::type XfrmTuple; + + return XfrmTuple(f(thrust::get<0>(t)), + f(thrust::get<1>(t)), + f(thrust::get<2>(t)), + f(thrust::get<3>(t)), + f(thrust::get<4>(t)), + f(thrust::get<5>(t)), + f(thrust::get<6>(t)), + f(thrust::get<7>(t)), + f(thrust::get<8>(t))); + } +}; + + +template class UnaryMetaFunction, + typename UnaryFunction> + struct tuple_transform_functor +{ + static __host__ + typename tuple_meta_transform::type + do_it_on_the_host(const Tuple &t, UnaryFunction f) + { + typedef typename tuple_meta_transform::type XfrmTuple; + + return XfrmTuple(f(thrust::get<0>(t)), + f(thrust::get<1>(t)), + f(thrust::get<2>(t)), + f(thrust::get<3>(t)), + f(thrust::get<4>(t)), + f(thrust::get<5>(t)), + f(thrust::get<6>(t)), + f(thrust::get<7>(t)), + f(thrust::get<8>(t)), + f(thrust::get<9>(t))); + } + + static __host__ __device__ + typename tuple_meta_transform::type + do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f) + { + typedef typename tuple_meta_transform::type XfrmTuple; + + return XfrmTuple(f(thrust::get<0>(t)), + f(thrust::get<1>(t)), + f(thrust::get<2>(t)), + f(thrust::get<3>(t)), + f(thrust::get<4>(t)), + f(thrust::get<5>(t)), + f(thrust::get<6>(t)), + f(thrust::get<7>(t)), + f(thrust::get<8>(t)), + f(thrust::get<9>(t))); + } +}; + + +template class UnaryMetaFunction, + typename Tuple, + typename UnaryFunction> +typename tuple_meta_transform::type +tuple_host_transform(const Tuple &t, UnaryFunction f) +{ + return tuple_transform_functor::do_it_on_the_host(t,f); +} + +template class UnaryMetaFunction, + typename Tuple, + typename UnaryFunction> +typename tuple_meta_transform::type +__host__ __device__ +tuple_host_device_transform(const Tuple &t, UnaryFunction f) +{ + return tuple_transform_functor::do_it_on_the_host_or_device(t,f); +} + +} // end detail + +} // end thrust + diff --git a/compat/thrust/detail/type_traits.h b/compat/thrust/detail/type_traits.h new file mode 100644 index 0000000..5dbeb90 --- /dev/null +++ b/compat/thrust/detail/type_traits.h @@ -0,0 +1,641 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file type_traits.h + * \brief Temporarily define some type traits + * until nvcc can compile tr1::type_traits. + */ + +#pragma once + +#include + +// XXX nvcc 2.2 closed beta can't compile type_traits +//// find type_traits +// +//#ifdef __GNUC__ +// +//#if __GNUC__ == 4 && __GNUC_MINOR__ == 2 +//#include +//#elif __GNUC__ == 4 && __GNUC_MINOR__ > 2 +//#include +//#endif // GCC version +// +//#endif // GCC +// +//#ifdef _MSC_VER +//#include +//#endif // MSVC + + +namespace thrust +{ + +// forward declaration of device_reference +template class device_reference; + +namespace detail +{ + /// helper classes [4.3]. + template + struct integral_constant + { + static const _Tp value = __v; + typedef _Tp value_type; + typedef integral_constant<_Tp, __v> type; + }; + + /// typedef for true_type + typedef integral_constant true_type; + + /// typedef for true_type + typedef integral_constant false_type; + +//template struct is_integral : public std::tr1::is_integral {}; +template struct is_integral : public false_type {}; +template<> struct is_integral : public true_type {}; +template<> struct is_integral : public true_type {}; +template<> struct is_integral : public true_type {}; +template<> struct is_integral : public true_type {}; +template<> struct is_integral : public true_type {}; +template<> struct is_integral : public true_type {}; +template<> struct is_integral : public true_type {}; +template<> struct is_integral : public true_type {}; +template<> struct is_integral : public true_type {}; +template<> struct is_integral : public true_type {}; +template<> struct is_integral : public true_type {}; +template<> struct is_integral : public true_type {}; +template<> struct is_integral : public true_type {}; +template<> struct is_integral : public true_type {}; +template<> struct is_integral : public true_type {}; +template<> struct is_integral : public true_type {}; +template<> struct is_integral : public true_type {}; +template<> struct is_integral : public true_type {}; +template<> struct is_integral : public true_type {}; +template<> struct is_integral : public true_type {}; +template<> struct is_integral : public true_type {}; +template<> struct is_integral : public true_type {}; +template<> struct is_integral : public true_type {}; + +template struct is_floating_point : public false_type {}; +template<> struct is_floating_point : public true_type {}; +template<> struct is_floating_point : public true_type {}; +template<> struct is_floating_point : public true_type {}; + +template struct is_arithmetic : public is_integral {}; +template<> struct is_arithmetic : public true_type {}; +template<> struct is_arithmetic : public true_type {}; +template<> struct is_arithmetic : public true_type {}; +template<> struct is_arithmetic : public true_type {}; + +template struct is_pointer : public false_type {}; +template struct is_pointer : public true_type {}; + +template struct is_device_ptr : public false_type {}; + +template struct is_void : public false_type {}; +template<> struct is_void : public true_type {}; +template<> struct is_void : public true_type {}; + + 
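
The traits above give nvcc-era code a self-contained stand-in for tr1/std type_traits: integral_constant carries the compile-time boolean, true_type and false_type are its two instances, and the is_integral / is_floating_point / is_arithmetic / is_pointer / is_void families are built by explicit specialization. A minimal sketch of how such traits are typically consumed via tag dispatch follows; the helper names and the include path are assumptions for illustration, not part of this patch.

#include <cstddef>
#include <cstring>
#include <thrust/detail/type_traits.h>

// Pick a copy strategy at compile time from the trait's nested ::type
// (true_type or false_type), in the spirit of the traits defined above.
template <typename T>
void copy_n_dispatch(T *dst, const T *src, std::size_t n,
                     thrust::detail::true_type)   // arithmetic: bitwise copy is safe
{
  std::memcpy(dst, src, n * sizeof(T));
}

template <typename T>
void copy_n_dispatch(T *dst, const T *src, std::size_t n,
                     thrust::detail::false_type)  // otherwise: element-wise assignment
{
  for (std::size_t i = 0; i != n; ++i) dst[i] = src[i];
}

template <typename T>
void copy_n_sketch(T *dst, const T *src, std::size_t n)
{
  // is_arithmetic<T> derives from integral_constant, so its nested ::type is
  // true_type or false_type and selects one of the overloads above.
  copy_n_dispatch(dst, src, n,
                  typename thrust::detail::is_arithmetic<T>::type());
}

For example, copy_n_sketch over an array of int takes the memcpy path, while a type with a user-defined assignment operator falls back to the element-wise loop.
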
+namespace tt_detail +{ + + +} // end tt_detail + +template struct is_pod + : public integral_constant< + bool, + is_void::value || is_pointer::value || is_arithmetic::value +#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC +// use intrinsic type traits + || __is_pod(T) +#elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC +// only use the intrinsic for >= 4.3 +#if (__GNUC__ >= 4) && (__GNUC_MINOR__ >= 3) + || __is_pod(T) +#endif // GCC VERSION +#endif // THRUST_HOST_COMPILER + > + {}; + + +template struct has_trivial_constructor + : public integral_constant< + bool, + is_pod::value +#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC + || __has_trivial_constructor(T) +#elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC +// only use the intrinsic for >= 4.3 +#if (__GNUC__ >= 4) && (__GNUC_MINOR__ >= 3) + || __has_trivial_constructor(T) +#endif // GCC VERSION +#endif // THRUST_HOST_COMPILER + > +{}; + +template struct has_trivial_copy_constructor + : public integral_constant< + bool, + is_pod::value +#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC + || __has_trivial_copy(T) +#elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC +// only use the intrinsic for >= 4.3 +#if (__GNUC__ >= 4) && (__GNUC_MINOR__ >= 3) + || __has_trivial_copy(T) +#endif // GCC VERSION +#endif // THRUST_HOST_COMPILER + > +{}; + +template struct has_trivial_destructor : public is_pod {}; + +template struct is_const : public false_type {}; +template struct is_const : public true_type {}; + +template struct is_volatile : public false_type {}; +template struct is_volatile : public true_type {}; + +template + struct add_const +{ + typedef T const type; +}; // end add_const + +template + struct remove_const +{ + typedef T type; +}; // end remove_const + +template + struct remove_const +{ + typedef T type; +}; // end remove_const + +template + struct add_volatile +{ + typedef volatile T type; +}; // end add_volatile + +template + struct remove_volatile +{ + typedef T type; +}; // end remove_volatile + +template + struct remove_volatile +{ + typedef T type; +}; // end remove_volatile + +template + struct add_cv +{ + typedef const volatile T type; +}; // end add_cv + +template + struct remove_cv +{ + typedef typename remove_const::type>::type type; +}; // end remove_cv + + +template struct is_reference : public false_type {}; +template struct is_reference : public true_type {}; + +template struct is_device_reference : public false_type {}; +template struct is_device_reference< thrust::device_reference > : public true_type {}; + + +// NB: Careful with reference to void. 
+template::value || is_reference<_Tp>::value)> + struct __add_reference_helper + { typedef _Tp& type; }; + +template + struct __add_reference_helper<_Tp, true> + { typedef _Tp type; }; + +template + struct add_reference + : public __add_reference_helper<_Tp>{}; + +template + struct remove_reference +{ + typedef T type; +}; // end remove_reference + +template + struct remove_reference +{ + typedef T type; +}; // end remove_reference + +template + struct is_same + : public false_type +{ +}; // end is_same + +template + struct is_same + : public true_type +{ +}; // end is_same + +template + struct lazy_is_same + : is_same +{ +}; // end lazy_is_same + +template + struct is_different + : public true_type +{ +}; // end is_different + +template + struct is_different + : public false_type +{ +}; // end is_different + +template + struct lazy_is_different + : is_different +{ +}; // end lazy_is_different + +namespace tt_detail +{ + +template + struct is_int_or_cref +{ + typedef typename remove_reference::type type_sans_ref; + static const bool value = (is_integral::value + || (is_integral::value + && is_const::value + && !is_volatile::value)); +}; // end is_int_or_cref + + +__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN +__THRUST_DISABLE_MSVC_FORCING_VALUE_TO_BOOL_BEGIN + + +template + struct is_convertible_sfinae +{ + private: + typedef char one_byte; + typedef struct { char two_chars[2]; } two_bytes; + + static one_byte test(To); + static two_bytes test(...); + static From m_from; + + public: + static const bool value = sizeof(test(m_from)) == sizeof(one_byte); +}; // end is_convertible_sfinae + + +__THRUST_DISABLE_MSVC_FORCING_VALUE_TO_BOOL_END +__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END + + +template + struct is_convertible_needs_simple_test +{ + static const bool from_is_void = is_void::value; + static const bool to_is_void = is_void::value; + static const bool from_is_float = is_floating_point::type>::value; + static const bool to_is_int_or_cref = is_int_or_cref::value; + + static const bool value = (from_is_void || to_is_void || (from_is_float && to_is_int_or_cref)); +}; // end is_convertible_needs_simple_test + + +template::value> + struct is_convertible +{ + static const bool value = (is_void::value + || (is_int_or_cref::value + && !is_void::value)); +}; // end is_convertible + + +template + struct is_convertible +{ + static const bool value = (is_convertible_sfinae::type, To>::value); +}; // end is_convertible + + +} // end tt_detail + +template + struct is_convertible + : public integral_constant::value> +{ +}; // end is_convertible + + +template + struct is_one_convertible_to_the_other + : public integral_constant< + bool, + is_convertible::value || is_convertible::value + > +{}; + + +// mpl stuff + +template + struct or_ + : public integral_constant< + bool, + Condition1::value || Condition2::value || Condition3::value || Condition4::value || Condition5::value || Condition6::value || Condition7::value || Condition8::value || Condition9::value || Condition10::value + > +{ +}; // end or_ + +template + struct and_ + : public integral_constant +{ +}; // end and_ + +template + struct not_ + : public integral_constant +{ +}; // end not_ + +template + struct eval_if +{ +}; // end eval_if + +template + struct eval_if +{ + typedef typename Then::type type; +}; // end eval_if + +template + struct eval_if +{ + typedef typename Else::type type; +}; // end eval_if + +template +// struct identity +// XXX WAR nvcc's confusion with thrust::identity + struct identity_ +{ + typedef 
T type; +}; // end identity + +template struct enable_if {}; +template struct enable_if {typedef T type;}; + +template struct lazy_enable_if {}; +template struct lazy_enable_if {typedef typename T::type type;}; + +template struct disable_if : enable_if {}; +template struct lazy_disable_if : lazy_enable_if {}; + + +template + struct enable_if_convertible + : enable_if< is_convertible::value, T > +{}; + + +template + struct disable_if_convertible + : disable_if< is_convertible::value, T > +{}; + + +template + struct enable_if_different + : enable_if::value, Result> +{}; + + +template + struct is_numeric + : and_< + is_convertible, + is_convertible + > +{ +}; // end is_numeric + + +template struct is_reference_to_const : false_type {}; +template struct is_reference_to_const : true_type {}; + + +// make_unsigned follows + +namespace tt_detail +{ + +template struct make_unsigned_simple; + +template<> struct make_unsigned_simple { typedef unsigned char type; }; +template<> struct make_unsigned_simple { typedef signed char type; }; +template<> struct make_unsigned_simple { typedef unsigned char type; }; +template<> struct make_unsigned_simple { typedef unsigned short type; }; +template<> struct make_unsigned_simple { typedef unsigned short type; }; +template<> struct make_unsigned_simple { typedef unsigned int type; }; +template<> struct make_unsigned_simple { typedef unsigned int type; }; +template<> struct make_unsigned_simple { typedef unsigned long int type; }; +template<> struct make_unsigned_simple { typedef unsigned long int type; }; +template<> struct make_unsigned_simple { typedef unsigned long long int type; }; +template<> struct make_unsigned_simple { typedef unsigned long long int type; }; + +template + struct make_unsigned_base +{ + // remove cv + typedef typename remove_cv::type remove_cv_t; + + // get the simple unsigned type + typedef typename make_unsigned_simple::type unsigned_remove_cv_t; + + // add back const, volatile, both, or neither to the simple result + typedef typename eval_if< + is_const::value && is_volatile::value, + // add cv back + add_cv, + // check const & volatile individually + eval_if< + is_const::value, + // add c back + add_const, + eval_if< + is_volatile::value, + // add v back + add_volatile, + // original type was neither cv, return the simple unsigned result + identity_ + > + > + >::type type; +}; + +} // end tt_detail + +template + struct make_unsigned + : tt_detail::make_unsigned_base +{}; + +struct largest_available_float +{ +#if defined(__CUDA_ARCH__) +# if (__CUDA_ARCH__ < 130) + typedef float type; +# else + typedef double type; +# endif +#else + typedef double type; +#endif +}; + +// T1 wins if they are both the same size +template + struct larger_type + : thrust::detail::eval_if< + (sizeof(T2) > sizeof(T1)), + thrust::detail::identity_, + thrust::detail::identity_ + > +{}; + + +namespace is_base_of_ns +{ + +typedef char yes; +typedef struct { char two_chars[2]; } no; + +template + struct host +{ + operator Base*() const; + operator Derived*(); +}; // end host + +template + struct impl +{ + template static yes check(Derived *, T); + static no check(Base*, int); + + static const bool value = sizeof(check(host(), int())) == sizeof(yes); +}; // end impl + +} // end is_base_of_ns + + +template + struct is_base_of + : integral_constant< + bool, + is_base_of_ns::impl::value + > +{}; + +template + struct enable_if_base_of + : enable_if< + is_base_of::value, + Result + > +{}; + + +namespace is_assignable_ns +{ + +template + class is_assignable +{ + 
typedef char yes_type; + typedef struct { char array[2]; } no_type; + + template static typename add_reference::type declval(); + + template struct helper { typedef void * type; }; + + template static yes_type test(typename helper() = declval())>::type); + + template static no_type test(...); + + public: + static const bool value = sizeof(test(0)) == 1; +}; // end is_assignable + +} // end is_assignable_ns + + +template + struct is_assignable + : integral_constant< + bool, + is_assignable_ns::is_assignable::value + > +{}; + + +template + struct is_copy_assignable + : is_assignable< + typename add_reference::type, + typename add_reference::type>::type + > +{}; + + +} // end detail + +} // end thrust + +#include + diff --git a/compat/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h b/compat/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h new file mode 100644 index 0000000..92767b5 --- /dev/null +++ b/compat/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h @@ -0,0 +1,61 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ + +namespace detail +{ + +// this trait reports what type should be used as a temporary in certain algorithms +// which aggregate intermediate results from a function before writing to an output iterator + +// the pseudocode for deducing the type of the temporary used below: +// +// if Function is an AdaptableFunction +// result = Function::result_type +// else if OutputIterator2 is a "pure" output iterator +// result = InputIterator2::value_type +// else +// result = OutputIterator2::value_type +// +// XXX upon c++0x, TemporaryType needs to be: +// result_of::type +template + struct intermediate_type_from_function_and_iterators + : eval_if< + has_result_type::value, + result_type, + eval_if< + is_output_iterator::value, + thrust::iterator_value, + thrust::iterator_value + > + > +{ +}; // end intermediate_type_from_function_and_iterators + +} // end detail + +} // end thrust + diff --git a/compat/thrust/detail/type_traits/function_traits.h b/compat/thrust/detail/type_traits/function_traits.h new file mode 100644 index 0000000..39015c6 --- /dev/null +++ b/compat/thrust/detail/type_traits/function_traits.h @@ -0,0 +1,96 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +namespace thrust +{ + +// forward definitions for is_commutative +template struct plus; +template struct multiplies; +template struct minimum; +template struct maximum; +template struct logical_or; +template struct logical_and; +template struct bit_or; +template struct bit_and; +template struct bit_xor; + +namespace detail +{ + + +// some metafunctions which check for the nested types of the adaptable functions + +__THRUST_DEFINE_HAS_NESTED_TYPE(has_result_type, result_type) + +__THRUST_DEFINE_HAS_NESTED_TYPE(has_argument_type, argument_type) + +__THRUST_DEFINE_HAS_NESTED_TYPE(has_first_argument_type, first_argument_type) + +__THRUST_DEFINE_HAS_NESTED_TYPE(has_second_argument_type, second_argument_type) + + +template + struct result_type +{ + typedef typename AdaptableBinaryFunction::result_type type; +}; + + +template + struct is_adaptable_unary_function + : thrust::detail::and_< + has_result_type, + has_argument_type + > +{}; + + +template + struct is_adaptable_binary_function + : thrust::detail::and_< + has_result_type, + thrust::detail::and_< + has_first_argument_type, + has_second_argument_type + > + > +{}; + + +template + struct is_commutative + : public thrust::detail::false_type +{}; + +template struct is_commutative< typename thrust::plus > : public thrust::detail::is_arithmetic {}; +template struct is_commutative< typename thrust::multiplies > : public thrust::detail::is_arithmetic {}; +template struct is_commutative< typename thrust::minimum > : public thrust::detail::is_arithmetic {}; +template struct is_commutative< typename thrust::maximum > : public thrust::detail::is_arithmetic {}; +template struct is_commutative< typename thrust::logical_or > : public thrust::detail::is_arithmetic {}; +template struct is_commutative< typename thrust::logical_and > : public thrust::detail::is_arithmetic {}; +template struct is_commutative< typename thrust::bit_or > : public thrust::detail::is_arithmetic {}; +template struct is_commutative< typename thrust::bit_and > : public thrust::detail::is_arithmetic {}; +template struct is_commutative< typename thrust::bit_xor > : public thrust::detail::is_arithmetic {}; + +} // end namespace detail +} // end namespace thrust + diff --git a/compat/thrust/detail/type_traits/has_member_function.h b/compat/thrust/detail/type_traits/has_member_function.h new file mode 100644 index 0000000..117f4cb --- /dev/null +++ b/compat/thrust/detail/type_traits/has_member_function.h @@ -0,0 +1,118 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#define __THRUST_DEFINE_HAS_MEMBER_FUNCTION(trait_name, member_function_name) \ +template class trait_name; \ + \ +template \ +class trait_name \ +{ \ + class yes { char m; }; \ + class no { yes m[2]; }; \ + struct base_mixin \ + { \ + Result member_function_name(); \ + }; \ + struct base : public T, public base_mixin {}; \ + template class helper{}; \ + template \ + static no deduce(U*, helper* = 0); \ + static yes deduce(...); \ +public: \ + static const bool value = sizeof(yes) == sizeof(deduce(static_cast(0))); \ + typedef thrust::detail::integral_constant type; \ +}; \ + \ +template \ +class trait_name \ +{ \ + class yes { char m; }; \ + class no { yes m[2]; }; \ + struct base_mixin \ + { \ + Result member_function_name(Arg); \ + }; \ + struct base : public T, public base_mixin {}; \ + template class helper{}; \ + template \ + static no deduce(U*, helper* = 0); \ + static yes deduce(...); \ +public: \ + static const bool value = sizeof(yes) == sizeof(deduce(static_cast(0))); \ + typedef thrust::detail::integral_constant type; \ +}; \ + \ +template \ +class trait_name \ +{ \ + class yes { char m; }; \ + class no { yes m[2]; }; \ + struct base_mixin \ + { \ + Result member_function_name(Arg1,Arg2); \ + }; \ + struct base : public T, public base_mixin {}; \ + template class helper{}; \ + template \ + static no deduce(U*, helper* = 0); \ + static yes deduce(...); \ +public: \ + static const bool value = sizeof(yes) == sizeof(deduce(static_cast(0))); \ + typedef thrust::detail::integral_constant type; \ +}; \ + \ +template \ +class trait_name \ +{ \ + class yes { char m; }; \ + class no { yes m[2]; }; \ + struct base_mixin \ + { \ + Result member_function_name(Arg1,Arg2,Arg3); \ + }; \ + struct base : public T, public base_mixin {}; \ + template class helper{}; \ + template \ + static no deduce(U*, helper* = 0); \ + static yes deduce(...); \ +public: \ + static const bool value = sizeof(yes) == sizeof(deduce(static_cast(0))); \ + typedef thrust::detail::integral_constant type; \ +}; \ + \ +template \ +class trait_name \ +{ \ + class yes { char m; }; \ + class no { yes m[2]; }; \ + struct base_mixin \ + { \ + Result member_function_name(Arg1,Arg2,Arg3,Arg4); \ + }; \ + struct base : public T, public base_mixin {}; \ + template class helper{}; \ + template \ + static no deduce(U*, helper* = 0); \ + static yes deduce(...); \ +public: \ + static const bool value = sizeof(yes) == sizeof(deduce(static_cast(0))); \ + typedef thrust::detail::integral_constant type; \ +}; + diff --git a/compat/thrust/detail/type_traits/has_nested_type.h b/compat/thrust/detail/type_traits/has_nested_type.h new file mode 100644 index 0000000..98c9460 --- /dev/null +++ b/compat/thrust/detail/type_traits/has_nested_type.h @@ -0,0 +1,32 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#define __THRUST_DEFINE_HAS_NESTED_TYPE(trait_name, nested_type_name) \ +template \ + struct trait_name \ +{ \ + typedef char yes_type; \ + typedef int no_type; \ + template static yes_type test(typename S::nested_type_name *); \ + template static no_type test(...); \ + static bool const value = sizeof(test(0)) == sizeof(yes_type);\ + typedef thrust::detail::integral_constant type;\ +}; + diff --git a/compat/thrust/detail/type_traits/has_trivial_assign.h b/compat/thrust/detail/type_traits/has_trivial_assign.h new file mode 100644 index 0000000..d248245 --- /dev/null +++ b/compat/thrust/detail/type_traits/has_trivial_assign.h @@ -0,0 +1,52 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file type_traits.h + * \brief Temporarily define some type traits + * until nvcc can compile tr1::type_traits. + */ + +#pragma once + +#include +#include + +namespace thrust +{ + +namespace detail +{ + +template struct has_trivial_assign + : public integral_constant< + bool, + (is_pod::value && !is_const::value) +#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC + || __has_trivial_assign(T) +#elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC +// only use the intrinsic for >= 4.3 +#if (__GNUC__ >= 4) && (__GNUC_MINOR__ >= 3) + || __has_trivial_assign(T) +#endif // GCC VERSION +#endif // THRUST_HOST_COMPILER + > +{}; + +} // end detail + +} // end thrust + diff --git a/compat/thrust/detail/type_traits/is_call_possible.h b/compat/thrust/detail/type_traits/is_call_possible.h new file mode 100644 index 0000000..41b9539 --- /dev/null +++ b/compat/thrust/detail/type_traits/is_call_possible.h @@ -0,0 +1,161 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +// inspired by Roman Perepelitsa's presentation from comp.lang.c++.moderated +// based on the implementation here: http://www.rsdn.ru/forum/cpp/2759773.1.aspx + +namespace thrust +{ +namespace detail +{ +namespace is_call_possible_detail +{ + +template class void_exp_result {}; + +template +U const& operator,(U const&, void_exp_result); + +template +U& operator,(U&, void_exp_result); + +template +struct clone_constness +{ + typedef dest_type type; +}; + +template +struct clone_constness +{ + typedef const dest_type type; +}; + +} // end is_call_possible_detail +} // end detail +} // end thrust + +#define __THRUST_DEFINE_IS_CALL_POSSIBLE(trait_name, member_function_name) \ +__THRUST_DEFINE_HAS_MEMBER_FUNCTION(trait_name##_has_member, member_function_name) \ + \ +template \ +struct trait_name \ +{ \ + private: \ + struct yes {}; \ + struct no { yes m[2]; }; \ + struct derived : public T \ + { \ + using T::member_function_name; \ + no member_function_name(...) const; \ + }; \ + \ + typedef typename thrust::detail::is_call_possible_detail::clone_constness::type derived_type; \ + \ + template \ + struct return_value_check \ + { \ + static yes deduce(Result); \ + static no deduce(...); \ + static no deduce(no); \ + static no deduce(thrust::detail::is_call_possible_detail::void_exp_result); \ + }; \ + \ + template \ + struct return_value_check \ + { \ + static yes deduce(...); \ + static no deduce(no); \ + }; \ + \ + template \ + struct impl \ + { \ + static const bool value = false; \ + }; \ + \ + template \ + struct impl \ + { \ + static typename add_reference::type test_me; \ + static typename add_reference::type arg; \ + \ + static const bool value = \ + sizeof( \ + return_value_check::deduce( \ + (test_me.member_function_name(arg), thrust::detail::is_call_possible_detail::void_exp_result()) \ + ) \ + ) == sizeof(yes); \ + }; \ + \ + template \ + struct impl \ + { \ + static typename add_reference::type test_me; \ + static typename add_reference::type arg1; \ + static typename add_reference::type arg2; \ + \ + static const bool value = \ + sizeof( \ + return_value_check::deduce( \ + (test_me.member_function_name(arg1,arg2), thrust::detail::is_call_possible_detail::void_exp_result()) \ + ) \ + ) == sizeof(yes); \ + }; \ + \ + template \ + struct impl \ + { \ + static typename add_reference::type test_me; \ + static typename add_reference::type arg1; \ + static typename add_reference::type arg2; \ + static typename add_reference::type arg3; \ + \ + static const bool value = \ + sizeof( \ + return_value_check::deduce( \ + (test_me.member_function_name(arg1,arg2,arg3), thrust::detail::is_call_possible_detail::void_exp_result()) \ + ) \ + ) == sizeof(yes); \ + }; \ + \ + template \ + struct impl \ + { \ + static typename add_reference::type test_me; \ + static typename add_reference::type arg1; \ + static typename add_reference::type arg2; \ + static typename add_reference::type arg3; \ + static typename add_reference::type arg4; \ + \ + static const bool value = \ + sizeof( \ + return_value_check::deduce( \ + (test_me.member_function_name(arg1,arg2,arg3,arg4), thrust::detail::is_call_possible_detail::void_exp_result()) \ + ) \ + ) == sizeof(yes); \ + }; \ + \ + public: \ + static const bool value = impl::value, Signature>::value; \ + typedef thrust::detail::integral_constant type; \ +}; + diff --git a/compat/thrust/detail/type_traits/is_metafunction_defined.h b/compat/thrust/detail/type_traits/is_metafunction_defined.h new file mode 100644 index 
0000000..fba0811 --- /dev/null +++ b/compat/thrust/detail/type_traits/is_metafunction_defined.h @@ -0,0 +1,41 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace thrust +{ + +namespace detail +{ + +__THRUST_DEFINE_HAS_NESTED_TYPE(is_metafunction_defined, type) + +template + struct enable_if_defined + : thrust::detail::lazy_enable_if< + is_metafunction_defined::value, + Metafunction + > +{}; + +} // end detail + +} // end thrust + diff --git a/compat/thrust/detail/type_traits/iterator/is_discard_iterator.h b/compat/thrust/detail/type_traits/iterator/is_discard_iterator.h new file mode 100644 index 0000000..cca59da --- /dev/null +++ b/compat/thrust/detail/type_traits/iterator/is_discard_iterator.h @@ -0,0 +1,40 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace detail +{ + +template +struct is_discard_iterator + : public thrust::detail::false_type +{}; + +template +struct is_discard_iterator< thrust::discard_iterator > + : public thrust::detail::true_type +{}; + +} // end namespace detail +} // end namespace thrust + diff --git a/compat/thrust/detail/type_traits/iterator/is_output_iterator.h b/compat/thrust/detail/type_traits/iterator/is_output_iterator.h new file mode 100644 index 0000000..4cefe63 --- /dev/null +++ b/compat/thrust/detail/type_traits/iterator/is_output_iterator.h @@ -0,0 +1,66 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace thrust +{ + +namespace detail +{ + + +template + struct is_void_like + : thrust::detail::or_< + thrust::detail::is_void, + thrust::detail::is_same + > +{}; // end is_void_like + + +template + struct lazy_is_void_like + : is_void_like +{}; // end lazy_is_void_like + + +// XXX this meta function should first check that T is actually an iterator +// +// if thrust::iterator_value is defined and thrust::iterator_value::type == void +// return false +// else +// return true +template + struct is_output_iterator + : eval_if< + is_metafunction_defined >::value, + lazy_is_void_like >, + thrust::detail::true_type + >::type +{ +}; // end is_output_iterator + +} // end detail + +} // end thrust + diff --git a/compat/thrust/detail/type_traits/minimum_type.h b/compat/thrust/detail/type_traits/minimum_type.h new file mode 100644 index 0000000..aaa011e --- /dev/null +++ b/compat/thrust/detail/type_traits/minimum_type.h @@ -0,0 +1,162 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace thrust +{ + +namespace detail +{ + +namespace minimum_type_detail +{ + +// +// Returns the minimum type or is empty +// if T1 and T2 are unrelated. 
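[Editor's note] The is_output_iterator metafunction earlier in this hunk boils down to a value_type check: an iterator is treated as output-only when its value type is void (or void-like), the convention followed by discard and insert iterators. A hedged C++11 sketch of the same test against the standard library, for illustration only:

#include <iterator>
#include <vector>
#include <type_traits>

template<typename Iterator>
struct is_output_only
  : std::is_void<typename std::iterator_traits<Iterator>::value_type> {};

// int* is readable, so it is not output-only:
static_assert(!is_output_only<int*>::value, "int* models input as well");
// back_insert_iterator reports a void value_type, so it is:
static_assert(is_output_only<std::back_insert_iterator<std::vector<int> > >::value,
              "back_insert_iterator is output-only");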
+// +template struct minimum_type_impl {}; + +template +struct minimum_type_impl +{ + typedef T2 type; +}; // end minimum_type_impl + +template +struct minimum_type_impl +{ + typedef T1 type; +}; // end minimum_type_impl + +template +struct minimum_type_impl +{ + typedef T1 type; +}; // end minimum_type_impl + +template +struct primitive_minimum_type + : minimum_type_detail::minimum_type_impl< + T1, + T2, + ::thrust::detail::is_convertible::value, + ::thrust::detail::is_convertible::value + > +{ +}; // end primitive_minimum_type + +// because some types are not convertible (even to themselves) +// specialize primitive_minimum_type for when both types are identical +template +struct primitive_minimum_type +{ + typedef T type; +}; // end primitive_minimum_type + +// XXX this belongs somewhere more general +struct any_conversion +{ + template operator T (void); +}; + +} // end minimum_type_detail + +template + struct minimum_type; + +// base case +template + struct minimum_type + : minimum_type_detail::primitive_minimum_type +{}; + +template + struct lazy_minimum_type + : minimum_type< + typename T1::type, + typename T2::type + > +{}; + +// carefully avoid referring to a nested ::type which may not exist +template + struct minimum_type + : lazy_minimum_type< + lazy_minimum_type< + lazy_minimum_type< + minimum_type< + T1,T2 + >, + minimum_type< + T3,T4 + > + >, + lazy_minimum_type< + minimum_type< + T5,T6 + >, + minimum_type< + T7,T8 + > + > + >, + lazy_minimum_type< + lazy_minimum_type< + minimum_type< + T9,T10 + >, + minimum_type< + T11,T12 + > + >, + lazy_minimum_type< + minimum_type< + T13,T14 + >, + minimum_type< + T15,T16 + > + > + > + > +{}; + +} // end detail + +} // end thrust + diff --git a/compat/thrust/detail/type_traits/pointer_traits.h b/compat/thrust/detail/type_traits/pointer_traits.h new file mode 100644 index 0000000..a0b5dc6 --- /dev/null +++ b/compat/thrust/detail/type_traits/pointer_traits.h @@ -0,0 +1,276 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
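[Editor's note] minimum_type above picks whichever of two related types the other converts to (the "weaker" one) and yields no nested ::type for unrelated types, so SFINAE can reject the combination. A compressed C++11 sketch of the pairwise case using std::is_convertible (illustrative names, not Thrust's own traits):

#include <iterator>
#include <type_traits>

template<typename T1, typename T2,
         bool = std::is_convertible<T1, T2>::value,
         bool = std::is_convertible<T2, T1>::value>
struct minimum_of {};                                              // unrelated: no ::type

template<typename T1, typename T2> struct minimum_of<T1,T2,true,false> { typedef T2 type; };
template<typename T1, typename T2> struct minimum_of<T1,T2,false,true> { typedef T1 type; };
template<typename T1, typename T2> struct minimum_of<T1,T2,true,true>  { typedef T1 type; };

// Iterator traversal/category tags are the typical use: random_access converts
// to forward but not vice versa, so the minimum is the weaker forward tag.
typedef minimum_of<std::random_access_iterator_tag,
                   std::forward_iterator_tag>::type weaker_tag;   // forward_iterator_tag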
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace detail +{ + +template struct pointer_element; + +template class Ptr, typename Arg> + struct pointer_element > +{ + typedef Arg type; +}; + +template class Ptr, typename Arg1, typename Arg2> + struct pointer_element > +{ + typedef Arg1 type; +}; + +template class Ptr, typename Arg1, typename Arg2, typename Arg3> + struct pointer_element > +{ + typedef Arg1 type; +}; + +template class Ptr, typename Arg1, typename Arg2, typename Arg3, typename Arg4> + struct pointer_element > +{ + typedef Arg1 type; +}; + +template class Ptr, typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename Arg5> + struct pointer_element > +{ + typedef Arg1 type; +}; + +template + struct pointer_element +{ + typedef T type; +}; + +template + struct pointer_difference +{ + typedef typename Ptr::difference_type type; +}; + +template + struct pointer_difference +{ + typedef std::ptrdiff_t type; +}; + +template struct rebind_pointer; + +template + struct rebind_pointer +{ + typedef U* type; +}; + +template class Ptr, typename Arg, typename T> + struct rebind_pointer,T> +{ + typedef Ptr type; +}; + +template class Ptr, typename Arg1, typename Arg2, typename T> + struct rebind_pointer,T> +{ + typedef Ptr type; +}; + +template class Ptr, typename Arg1, typename Arg2, typename Arg3, typename T> + struct rebind_pointer,T> +{ + typedef Ptr type; +}; + +template class Ptr, typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename T> + struct rebind_pointer,T> +{ + typedef Ptr type; +}; + +// XXX this should probably be renamed native_type or similar +__THRUST_DEFINE_HAS_NESTED_TYPE(has_raw_pointer, raw_pointer) + +namespace pointer_traits_detail +{ + +template struct pointer_raw_pointer_impl {}; + +template + struct pointer_raw_pointer_impl +{ + typedef T* type; +}; + +template + struct pointer_raw_pointer_impl::value>::type> +{ + typedef typename Ptr::raw_pointer type; +}; + +} // end pointer_traits_detail + +template + struct pointer_raw_pointer + : pointer_traits_detail::pointer_raw_pointer_impl +{}; + +namespace pointer_traits_detail +{ + +template + struct capture_address +{ + template + __host__ __device__ + capture_address(T &r) + : m_addr(&r) + {} + + inline __host__ __device__ + Void *operator&() const + { + return m_addr; + } + + Void *m_addr; +}; + +// metafunction to compute the type of pointer_to's parameter below +template + struct pointer_to_param + : thrust::detail::eval_if< + thrust::detail::is_void::value, + thrust::detail::identity_ >, + thrust::detail::add_reference + > +{}; + +} + +template + struct pointer_traits +{ + typedef Ptr pointer; + typedef typename pointer_element::type element_type; + typedef typename pointer_difference::type difference_type; + + template + struct rebind + { + typedef typename rebind_pointer::type other; + }; + + __host__ __device__ + inline static pointer pointer_to(typename pointer_traits_detail::pointer_to_param::type r) + { + // XXX this is supposed to be pointer::pointer_to(&r); (i.e., call a static member function of pointer called pointer_to) + // assume that pointer has a constructor from raw pointer instead + + return pointer(&r); + } + + // thrust additions follow + typedef typename pointer_raw_pointer::type raw_pointer; + + __host__ __device__ + inline static raw_pointer get(pointer ptr) + { + return ptr.get(); + } +}; + +template + struct pointer_traits +{ + typedef T* pointer; + typedef T element_type; + typedef typename 
pointer_difference::type difference_type; + + template + struct rebind + { + typedef U* other; + }; + + __host__ __device__ + inline static pointer pointer_to(typename pointer_traits_detail::pointer_to_param::type r) + { + return &r; + } + + // thrust additions follow + typedef typename pointer_raw_pointer::type raw_pointer; + + __host__ __device__ + inline static raw_pointer get(pointer ptr) + { + return ptr; + } +}; + +template + struct is_pointer_convertible + : thrust::detail::and_< + thrust::detail::is_convertible< + typename pointer_element::type *, + typename pointer_element::type * + >, + thrust::detail::is_convertible< + typename iterator_system::type, + typename iterator_system::type + > + > +{}; + +// this could be a lot better, but for our purposes, it's probably +// sufficient just to check if pointer_raw_pointer has meaning +template + struct is_thrust_pointer + : is_metafunction_defined > +{}; + +// avoid inspecting traits of the arguments if they aren't known to be pointers +template + struct lazy_is_pointer_convertible + : thrust::detail::eval_if< + is_thrust_pointer::value && is_thrust_pointer::value, + is_pointer_convertible, + thrust::detail::identity_ + > +{}; + +template + struct enable_if_pointer_is_convertible + : thrust::detail::enable_if< + lazy_is_pointer_convertible::type::value, + T + > +{}; + + +} // end detail +} // end thrust + diff --git a/compat/thrust/detail/type_traits/result_of.h b/compat/thrust/detail/type_traits/result_of.h new file mode 100644 index 0000000..e30b4fd --- /dev/null +++ b/compat/thrust/detail/type_traits/result_of.h @@ -0,0 +1,52 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace detail +{ + +template struct result_of; + +// specialization for unary invocations of things which have result_type +template + struct result_of< + Functor(Arg1), + typename thrust::detail::enable_if::value>::type + > +{ + typedef typename Functor::result_type type; +}; // end result_of + +// specialization for binary invocations of things which have result_type +template + struct result_of< + Functor(Arg1,Arg2), + typename thrust::detail::enable_if::value>::type + > +{ + typedef typename Functor::result_type type; +}; + +} // end detail +} // end thrust + diff --git a/compat/thrust/detail/uninitialized_copy.inl b/compat/thrust/detail/uninitialized_copy.inl new file mode 100644 index 0000000..a01dca5 --- /dev/null +++ b/compat/thrust/detail/uninitialized_copy.inl @@ -0,0 +1,93 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
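[Editor's note] pointer_traits above is what lets one code path handle both plain T* and Thrust's tagged pointers: pointer_element / pointer_difference / rebind_pointer recover the element, difference and rebound types, and pointer_raw_pointer plus get() strip the wrapper. In user code this surfaces as raw_pointer_cast; a small usage sketch, assuming the usual public headers:

#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>

void example(thrust::device_vector<float> &v)
{
  // device_ptr<float> -> float*, via pointer_traits<device_ptr<float> >::get()
  float *raw = thrust::raw_pointer_cast(v.data());

  // plain pointers pass through unchanged, via pointer_traits<int*>::get()
  int i = 0;
  int *praw = thrust::raw_pointer_cast(&i);

  (void)raw; (void)praw;
}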
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file uninitialized_copy.inl + * \brief Inline file for uninitialized_copy.h. + */ + +#include +#include +#include +#include +#include + +namespace thrust +{ + + +template + ForwardIterator uninitialized_copy(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + ForwardIterator result) +{ + using thrust::system::detail::generic::uninitialized_copy; + return uninitialized_copy(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result); +} // end uninitialized_copy() + + +template + ForwardIterator uninitialized_copy_n(const thrust::detail::execution_policy_base &exec, + InputIterator first, + Size n, + ForwardIterator result) +{ + using thrust::system::detail::generic::uninitialized_copy_n; + return uninitialized_copy_n(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, n, result); +} // end uninitialized_copy_n() + + +template + ForwardIterator uninitialized_copy(InputIterator first, + InputIterator last, + ForwardIterator result) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + + System1 system1; + System2 system2; + + return thrust::uninitialized_copy(select_system(system1,system2), first, last, result); +} // end uninitialized_copy() + + +template + ForwardIterator uninitialized_copy_n(InputIterator first, + Size n, + ForwardIterator result) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + + System1 system1; + System2 system2; + + return thrust::uninitialized_copy_n(select_system(system1,system2), first, n, result); +} // end uninitialized_copy_n() + + +} // end thrust + + diff --git a/compat/thrust/detail/uninitialized_fill.inl b/compat/thrust/detail/uninitialized_fill.inl new file mode 100644 index 0000000..3545de5 --- /dev/null +++ b/compat/thrust/detail/uninitialized_fill.inl @@ -0,0 +1,88 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file uninitialized_fill.inl + * \brief Inline file for uninitialized_fill.h. 
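[Editor's note] Each uninitialized_copy overload above only derives the execution system(s) from its iterators and forwards to the generic backend; the user-facing effect is copy-construction into raw, uninitialized storage. A hedged usage sketch:

#include <thrust/device_malloc.h>
#include <thrust/device_free.h>
#include <thrust/device_vector.h>
#include <thrust/uninitialized_copy.h>

void example()
{
  thrust::device_vector<int> src(4, 7);

  // raw device storage for four ints, not yet constructed
  thrust::device_ptr<int> raw = thrust::device_malloc<int>(4);

  // copy-construct the four elements in place
  thrust::uninitialized_copy(src.begin(), src.end(), raw);

  thrust::device_free(raw);
}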
+ */ + +#include +#include +#include +#include +#include + +namespace thrust +{ + + +template + void uninitialized_fill(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + const T &x) +{ + using thrust::system::detail::generic::uninitialized_fill; + return uninitialized_fill(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, x); +} // end uninitialized_fill() + + +template + ForwardIterator uninitialized_fill_n(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + Size n, + const T &x) +{ + using thrust::system::detail::generic::uninitialized_fill_n; + return uninitialized_fill_n(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, n, x); +} // end uninitialized_fill_n() + + +template + void uninitialized_fill(ForwardIterator first, + ForwardIterator last, + const T &x) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + thrust::uninitialized_fill(select_system(system), first, last, x); +} // end uninitialized_fill() + + +template + ForwardIterator uninitialized_fill_n(ForwardIterator first, + Size n, + const T &x) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::uninitialized_fill_n(select_system(system), first, n, x); +} // end uninitialized_fill_n() + + +} // end thrust + diff --git a/compat/thrust/detail/unique.inl b/compat/thrust/detail/unique.inl new file mode 100644 index 0000000..e90187d --- /dev/null +++ b/compat/thrust/detail/unique.inl @@ -0,0 +1,320 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file unique.inl + * \brief Inline file for unique.h. 
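[Editor's note] uninitialized_fill above follows the same dispatch pattern, with fill-construction instead of copy-construction; a brief usage sketch:

#include <thrust/device_malloc.h>
#include <thrust/device_free.h>
#include <thrust/uninitialized_fill.h>

void example()
{
  thrust::device_ptr<double> p = thrust::device_malloc<double>(8);
  thrust::uninitialized_fill(p, p + 8, 1.5);   // construct eight copies of 1.5 in place
  thrust::device_free(p);
}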
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ + + +template +ForwardIterator unique(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last) +{ + using thrust::system::detail::generic::unique; + return unique(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last); +} // end unique() + + +template +ForwardIterator unique(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + BinaryPredicate binary_pred) +{ + using thrust::system::detail::generic::unique; + return unique(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, binary_pred); +} // end unique() + + +template +OutputIterator unique_copy(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + OutputIterator output) +{ + using thrust::system::detail::generic::unique_copy; + return unique_copy(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, output); +} // end unique_copy() + + +template +OutputIterator unique_copy(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + OutputIterator output, + BinaryPredicate binary_pred) +{ + using thrust::system::detail::generic::unique_copy; + return unique_copy(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, output, binary_pred); +} // end unique_copy() + + +template + thrust::pair + unique_by_key(const thrust::detail::execution_policy_base &exec, + ForwardIterator1 keys_first, + ForwardIterator1 keys_last, + ForwardIterator2 values_first) +{ + using thrust::system::detail::generic::unique_by_key; + return unique_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first, keys_last, values_first); +} // end unique_by_key() + + +template + thrust::pair + unique_by_key(const thrust::detail::execution_policy_base &exec, + ForwardIterator1 keys_first, + ForwardIterator1 keys_last, + ForwardIterator2 values_first, + BinaryPredicate binary_pred) +{ + using thrust::system::detail::generic::unique_by_key; + return unique_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first, keys_last, values_first, binary_pred); +} // end unique_by_key() + + +template + thrust::pair + unique_by_key_copy(const thrust::detail::execution_policy_base &exec, + InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + OutputIterator1 keys_output, + OutputIterator2 values_output) +{ + using thrust::system::detail::generic::unique_by_key_copy; + return unique_by_key_copy(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first, keys_last, values_first, keys_output, values_output); +} // end unique_by_key_copy() + + +template + thrust::pair + unique_by_key_copy(const thrust::detail::execution_policy_base &exec, + InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + OutputIterator1 keys_output, + OutputIterator2 values_output, + BinaryPredicate binary_pred) +{ + using thrust::system::detail::generic::unique_by_key_copy; + return unique_by_key_copy(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first, keys_last, values_first, keys_output, values_output, binary_pred); +} // end unique_by_key_copy() + + +template + ForwardIterator unique(ForwardIterator first, + ForwardIterator last) +{ + using 
thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::unique(select_system(system), first, last); +} // end unique() + + +template + ForwardIterator unique(ForwardIterator first, + ForwardIterator last, + BinaryPredicate binary_pred) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + + System system; + + return thrust::unique(select_system(system), first, last, binary_pred); +} // end unique() + + +template + OutputIterator unique_copy(InputIterator first, + InputIterator last, + OutputIterator output) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + + System1 system1; + System2 system2; + + return thrust::unique_copy(select_system(system1,system2), first, last, output); +} // end unique_copy() + + +template + OutputIterator unique_copy(InputIterator first, + InputIterator last, + OutputIterator output, + BinaryPredicate binary_pred) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + + System1 system1; + System2 system2; + + return thrust::unique_copy(select_system(system1,system2), first, last, output, binary_pred); +} // end unique_copy() + + +template + thrust::pair + unique_by_key(ForwardIterator1 keys_first, + ForwardIterator1 keys_last, + ForwardIterator2 values_first) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + + System1 system1; + System2 system2; + + return thrust::unique_by_key(select_system(system1,system2), keys_first, keys_last, values_first); +} // end unique_by_key() + + +template + thrust::pair + unique_by_key(ForwardIterator1 keys_first, + ForwardIterator1 keys_last, + ForwardIterator2 values_first, + BinaryPredicate binary_pred) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + + System1 system1; + System2 system2; + + return thrust::unique_by_key(select_system(system1,system2), keys_first, keys_last, values_first, binary_pred); +} // end unique_by_key() + + +template + thrust::pair + unique_by_key_copy(InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + OutputIterator1 keys_output, + OutputIterator2 values_output) +{ + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + typedef typename thrust::iterator_system::type System4; + + System1 system1; + System2 system2; + System3 system3; + System4 system4; + + return thrust::unique_by_key_copy(select_system(system1,system2,system3,system4), keys_first, keys_last, values_first, keys_output, values_output); +} // end unique_by_key_copy() + + +template + thrust::pair + unique_by_key_copy(InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + OutputIterator1 keys_output, + OutputIterator2 values_output, + BinaryPredicate binary_pred) +{ + using thrust::system::detail::generic::select_system; + + typedef typename 
thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + typedef typename thrust::iterator_system::type System3; + typedef typename thrust::iterator_system::type System4; + + System1 system1; + System2 system2; + System3 system3; + System4 system4; + + return thrust::unique_by_key_copy(select_system(system1,system2,system3,system4), keys_first, keys_last, values_first, keys_output, values_output, binary_pred); +} // end unique_by_key_copy() + + +} // end namespace thrust + diff --git a/compat/thrust/detail/use_default.h b/compat/thrust/detail/use_default.h new file mode 100644 index 0000000..c6eb66e --- /dev/null +++ b/compat/thrust/detail/use_default.h @@ -0,0 +1,27 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace thrust +{ + +struct use_default {}; + +} // end thrust + diff --git a/compat/thrust/detail/util/align.h b/compat/thrust/detail/util/align.h new file mode 100644 index 0000000..10f107a --- /dev/null +++ b/compat/thrust/detail/util/align.h @@ -0,0 +1,52 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#pragma once + +#include + +// functions to handle memory alignment + +namespace thrust +{ +namespace detail +{ +namespace util +{ + +template +T * align_up(T * ptr, detail::uintptr_t bytes) +{ + return (T *) ( bytes * (((detail::uintptr_t) ptr + (bytes - 1)) / bytes) ); +} + +template +T * align_down(T * ptr, detail::uintptr_t bytes) +{ + return (T *) ( bytes * (detail::uintptr_t(ptr) / bytes) ); +} + +template +bool is_aligned(T * ptr, detail::uintptr_t bytes = sizeof(T)) +{ + return detail::uintptr_t(ptr) % bytes == 0; +} + +} // end namespace util +} // end namespace detail +} // end namespace thrust + diff --git a/compat/thrust/detail/util/blocking.h b/compat/thrust/detail/util/blocking.h new file mode 100644 index 0000000..3bb78a6 --- /dev/null +++ b/compat/thrust/detail/util/blocking.h @@ -0,0 +1,58 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
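[Editor's note] unique.inl above routes every unique / unique_copy / unique_by_key overload through the same select_system dispatch; what the caller sees is run-compaction over keys with the values carried along. A small usage sketch:

#include <thrust/unique.h>
#include <thrust/device_vector.h>
#include <thrust/pair.h>

void example()
{
  int  k[] = { 1, 1, 2, 2, 3 };
  char v[] = { 'a', 'b', 'c', 'd', 'e' };
  thrust::device_vector<int>  keys(k, k + 5);
  thrust::device_vector<char> vals(v, v + 5);

  typedef thrust::device_vector<int>::iterator  KeyIt;
  typedef thrust::device_vector<char>::iterator ValIt;

  // keeps the first key/value of each run of equal keys:
  // keys -> 1 2 3, vals -> a c e
  thrust::pair<KeyIt, ValIt> new_ends =
    thrust::unique_by_key(keys.begin(), keys.end(), vals.begin());

  (void)new_ends;  // new_ends.first / .second mark the ends of the compacted ranges
}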
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#pragma once + +//functions to support blocking + +namespace thrust +{ + +namespace detail +{ + +namespace util +{ + +// x/y rounding towards +infinity for integers, used to determine # of blocks/warps etc. +template + inline __host__ __device__ L divide_ri(const L x, const R y) +{ + return (x + (y - 1)) / y; +} + +// x/y rounding towards zero for integers, used to determine # of blocks/warps etc. +template + inline __host__ __device__ L divide_rz(const L x, const R y) +{ + return x / y; +} + +// round x towards infinity to the next multiple of y +template + inline __host__ __device__ L round_i(const L x, const R y){ return y * divide_ri(x, y); } + +// round x towards zero to the next multiple of y +template + inline __host__ __device__ L round_z(const L x, const R y){ return y * divide_rz(x, y); } + +} // end namespace util + +} // end namespace detail + +} // end namespace thrust + diff --git a/compat/thrust/detail/vector_base.h b/compat/thrust/detail/vector_base.h new file mode 100644 index 0000000..6974eab --- /dev/null +++ b/compat/thrust/detail/vector_base.h @@ -0,0 +1,534 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file vector_base.h + * \brief Defines the interface to a base class for + * host_vector & device_vector. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace thrust +{ + +namespace detail +{ + +template + class vector_base +{ + private: + typedef thrust::detail::contiguous_storage storage_type; + + public: + // typedefs + typedef typename storage_type::value_type value_type; + typedef typename storage_type::pointer pointer; + typedef typename storage_type::const_pointer const_pointer; + typedef typename storage_type::reference reference; + typedef typename storage_type::const_reference const_reference; + typedef typename storage_type::size_type size_type; + typedef typename storage_type::difference_type difference_type; + typedef typename storage_type::allocator_type allocator_type; + + typedef typename storage_type::iterator iterator; + typedef typename storage_type::const_iterator const_iterator; + + typedef thrust::reverse_iterator reverse_iterator; + typedef thrust::reverse_iterator const_reverse_iterator; + + /*! This constructor creates an empty vector_base. + */ + vector_base(void); + + /*! This constructor creates a vector_base with default-constructed + * elements. + * \param n The number of elements to create. + */ + explicit vector_base(size_type n); + + /*! 
This constructor creates a vector_base with copies + * of an exemplar element. + * \param n The number of elements to initially create. + * \param value An element to copy. + */ + explicit vector_base(size_type n, const value_type &value); + + /*! Copy constructor copies from an exemplar vector_base. + * \param v The vector_base to copy. + */ + vector_base(const vector_base &v); + + /*! assign operator makes a copy of an exemplar vector_base. + * \param v The vector_base to copy. + */ + vector_base &operator=(const vector_base &v); + + /*! Copy constructor copies from an exemplar vector_base with different + * type. + * \param v The vector_base to copy. + */ + template + vector_base(const vector_base &v); + + /*! assign operator makes a copy of an exemplar vector_base with different + * type. + * \param v The vector_base to copy. + */ + template + vector_base &operator=(const vector_base &v); + + /*! Copy constructor copies from an exemplar std::vector. + * \param v The std::vector to copy. + * XXX TODO: Make this method redundant with a properly templatized constructor. + * We would like to copy from a vector whose element type is anything + * assignable to value_type. + */ + template + vector_base(const std::vector &v); + + /*! assign operator makes a copy of an exemplar std::vector. + * \param v The vector to copy. + * XXX TODO: Templatize this assign on the type of the vector to copy from. + * We would like to copy from a vector whose element type is anything + * assignable to value_type. + */ + template + vector_base &operator=(const std::vector &v); + + /*! This constructor builds a vector_base from a range. + * \param first The beginning of the range. + * \param last The end of the range. + */ + template + vector_base(InputIterator first, InputIterator last); + + /*! The destructor erases the elements. + */ + ~vector_base(void); + + /*! \brief Resizes this vector_base to the specified number of elements. + * \param new_size Number of elements this vector_base should contain. + * \throw std::length_error If n exceeds max_size9). + * + * This method will resize this vector_base to the specified number of + * elements. If the number is smaller than this vector_base's current + * size this vector_base is truncated, otherwise this vector_base is + * extended and new elements are default constructed. + */ + void resize(size_type new_size); + + /*! \brief Resizes this vector_base to the specified number of elements. + * \param new_size Number of elements this vector_base should contain. + * \param x Data with which new elements should be populated. + * \throw std::length_error If n exceeds max_size(). + * + * This method will resize this vector_base to the specified number of + * elements. If the number is smaller than this vector_base's current + * size this vector_base is truncated, otherwise this vector_base is + * extended and new elements are populated with given data. + */ + void resize(size_type new_size, const value_type &x); + + /*! Returns the number of elements in this vector_base. + */ + size_type size(void) const; + + /*! Returns the size() of the largest possible vector_base. + * \return The largest possible return value of size(). + */ + size_type max_size(void) const; + + /*! \brief If n is less than or equal to capacity(), this call has no effect. + * Otherwise, this method is a request for allocation of additional memory. If + * the request is successful, then capacity() is greater than or equal to + * n; otherwise, capacity() is unchanged. 
In either case, size() is unchanged. + * \throw std::length_error If n exceeds max_size(). + */ + void reserve(size_type n); + + /*! Returns the number of elements which have been reserved in this + * vector_base. + */ + size_type capacity(void) const; + + /*! This method shrinks the capacity of this vector_base to exactly + * fit its elements. + */ + void shrink_to_fit(void); + + /*! \brief Subscript access to the data contained in this vector_dev. + * \param n The index of the element for which data should be accessed. + * \return Read/write reference to data. + * + * This operator allows for easy, array-style, data access. + * Note that data access with this operator is unchecked and + * out_of_range lookups are not defined. + */ + reference operator[](size_type n); + + /*! \brief Subscript read access to the data contained in this vector_dev. + * \param n The index of the element for which data should be accessed. + * \return Read reference to data. + * + * This operator allows for easy, array-style, data access. + * Note that data access with this operator is unchecked and + * out_of_range lookups are not defined. + */ + const_reference operator[](size_type n) const; + + /*! This method returns an iterator pointing to the beginning of + * this vector_base. + * \return mStart + */ + iterator begin(void); + + /*! This method returns a const_iterator pointing to the beginning + * of this vector_base. + * \return mStart + */ + const_iterator begin(void) const; + + /*! This method returns a const_iterator pointing to the beginning + * of this vector_base. + * \return mStart + */ + const_iterator cbegin(void) const; + + /*! This method returns a reverse_iterator pointing to the beginning of + * this vector_base's reversed sequence. + * \return A reverse_iterator pointing to the beginning of this + * vector_base's reversed sequence. + */ + reverse_iterator rbegin(void); + + /*! This method returns a const_reverse_iterator pointing to the beginning of + * this vector_base's reversed sequence. + * \return A const_reverse_iterator pointing to the beginning of this + * vector_base's reversed sequence. + */ + const_reverse_iterator rbegin(void) const; + + /*! This method returns a const_reverse_iterator pointing to the beginning of + * this vector_base's reversed sequence. + * \return A const_reverse_iterator pointing to the beginning of this + * vector_base's reversed sequence. + */ + const_reverse_iterator crbegin(void) const; + + /*! This method returns an iterator pointing to one element past the + * last of this vector_base. + * \return begin() + size(). + */ + iterator end(void); + + /*! This method returns a const_iterator pointing to one element past the + * last of this vector_base. + * \return begin() + size(). + */ + const_iterator end(void) const; + + /*! This method returns a const_iterator pointing to one element past the + * last of this vector_base. + * \return begin() + size(). + */ + const_iterator cend(void) const; + + /*! This method returns a reverse_iterator pointing to one element past the + * last of this vector_base's reversed sequence. + * \return rbegin() + size(). + */ + reverse_iterator rend(void); + + /*! This method returns a const_reverse_iterator pointing to one element past the + * last of this vector_base's reversed sequence. + * \return rbegin() + size(). + */ + const_reverse_iterator rend(void) const; + + /*! This method returns a const_reverse_iterator pointing to one element past the + * last of this vector_base's reversed sequence. 
+ * \return rbegin() + size(). + */ + const_reverse_iterator crend(void) const; + + /*! This method returns a const_reference referring to the first element of this + * vector_base. + * \return The first element of this vector_base. + */ + const_reference front(void) const; + + /*! This method returns a reference pointing to the first element of this + * vector_base. + * \return The first element of this vector_base. + */ + reference front(void); + + /*! This method returns a const reference pointing to the last element of + * this vector_base. + * \return The last element of this vector_base. + */ + const_reference back(void) const; + + /*! This method returns a reference referring to the last element of + * this vector_dev. + * \return The last element of this vector_base. + */ + reference back(void); + + /*! This method returns a pointer to this vector_base's first element. + * \return A pointer to the first element of this vector_base. + */ + pointer data(void); + + /*! This method returns a const_pointer to this vector_base's first element. + * \return a const_pointer to the first element of this vector_base. + */ + const_pointer data(void) const; + + /*! This method resizes this vector_base to 0. + */ + void clear(void); + + /*! This method returns true iff size() == 0. + * \return true if size() == 0; false, otherwise. + */ + bool empty(void) const; + + /*! This method appends the given element to the end of this vector_base. + * \param x The element to append. + */ + void push_back(const value_type &x); + + /*! This method erases the last element of this vector_base, invalidating + * all iterators and references to it. + */ + void pop_back(void); + + /*! This method swaps the contents of this vector_base with another vector_base. + * \param v The vector_base with which to swap. + */ + void swap(vector_base &v); + + /*! This method removes the element at position pos. + * \param pos The position of the element of interest. + * \return An iterator pointing to the new location of the element that followed the element + * at position pos. + */ + iterator erase(iterator pos); + + /*! This method removes the range of elements [first,last) from this vector_base. + * \param first The beginning of the range of elements to remove. + * \param last The end of the range of elements to remove. + * \return An iterator pointing to the new location of the element that followed the last + * element in the sequence [first,last). + */ + iterator erase(iterator first, iterator last); + + /*! This method inserts a single copy of a given exemplar value at the + * specified position in this vector_base. + * \param position The insertion position. + * \param x The exemplar element to copy & insert. + * \return An iterator pointing to the newly inserted element. + */ + iterator insert(iterator position, const T &x); + + /*! This method inserts a copy of an exemplar value to a range at the + * specified position in this vector_base. + * \param position The insertion position + * \param n The number of insertions to perform. + * \param x The value to replicate and insert. + */ + void insert(iterator position, size_type n, const T &x); + + /*! This method inserts a copy of an input range at the specified position + * in this vector_base. + * \param position The insertion position. + * \param first The beginning of the range to copy. + * \param last The end of the range to copy. + * + * \tparam InputIterator is a model of Assignable. 
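[Editor's note] vector_base is not used directly; it is the common engine behind host_vector and device_vector, which inherit the interface documented above. A short usage sketch through device_vector:

#include <thrust/device_vector.h>

void example()
{
  thrust::device_vector<int> v(3, 1);   // 1 1 1
  v.push_back(5);                       // 1 1 1 5
  v.insert(v.begin() + 1, 2, 9);        // 1 9 9 1 1 5
  v.erase(v.begin());                   // 9 9 1 1 5
  v.resize(3);                          // 9 9 1
  v.reserve(64);                        // capacity grows, size() stays 3
}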
+ */ + template + void insert(iterator position, InputIterator first, InputIterator last); + + /*! This version of \p assign replicates a given exemplar + * \p n times into this vector_base. + * \param n The number of times to copy \p x. + * \param x The exemplar element to replicate. + */ + void assign(size_type n, const T &x); + + /*! This version of \p assign makes this vector_base a copy of a given input range. + * \param first The beginning of the range to copy. + * \param last The end of the range to copy. + * + * \tparam InputIterator is a model of Input Iterator. + */ + template + void assign(InputIterator first, InputIterator last); + + /*! This method returns a copy of this vector's allocator. + * \return A copy of the alloctor used by this vector. + */ + allocator_type get_allocator(void) const; + + protected: + // Our storage + storage_type m_storage; + + // The size of this vector_base, in number of elements. + size_type m_size; + + private: + // these methods resolve the ambiguity of the constructor template of form (Iterator, Iterator) + template + void init_dispatch(IteratorOrIntegralType begin, IteratorOrIntegralType end, false_type); + + template + void init_dispatch(IteratorOrIntegralType n, IteratorOrIntegralType value, true_type); + + template + void range_init(InputIterator first, InputIterator last); + + template + void range_init(InputIterator first, InputIterator last, thrust::incrementable_traversal_tag); + + template + void range_init(ForwardIterator first, ForwardIterator last, thrust::random_access_traversal_tag); + + void default_init(size_type n); + + void fill_init(size_type n, const T &x); + + // these methods resolve the ambiguity of the insert() template of form (iterator, InputIterator, InputIterator) + template + void insert_dispatch(iterator position, InputIteratorOrIntegralType first, InputIteratorOrIntegralType last, false_type); + + // these methods resolve the ambiguity of the insert() template of form (iterator, InputIterator, InputIterator) + template + void insert_dispatch(iterator position, InputIteratorOrIntegralType n, InputIteratorOrIntegralType x, true_type); + + // this method appends n default-constructed elements at the end + void append(size_type n); + + // this method performs insertion from a fill value + void fill_insert(iterator position, size_type n, const T &x); + + // this method performs insertion from a range + template + void copy_insert(iterator position, InputIterator first, InputIterator last); + + // these methods resolve the ambiguity of the assign() template of form (InputIterator, InputIterator) + template + void assign_dispatch(InputIterator first, InputIterator last, false_type); + + // these methods resolve the ambiguity of the assign() template of form (InputIterator, InputIterator) + template + void assign_dispatch(Integral n, Integral x, true_type); + + // this method performs assignment from a range + template + void range_assign(InputIterator first, InputIterator last); + + // this method performs assignment from a range of RandomAccessIterators + template + void range_assign(RandomAccessIterator first, RandomAccessIterator last, thrust::random_access_traversal_tag); + + // this method performs assignment from a range of InputIterators + template + void range_assign(InputIterator first, InputIterator last, thrust::incrementable_traversal_tag); + + // this method performs assignment from a fill value + void fill_assign(size_type n, const T &x); + + // this method allocates new storage and construct copies the 
given range + template + void allocate_and_copy(size_type requested_size, + ForwardIterator first, ForwardIterator last, + storage_type &new_storage); +}; // end vector_base + +} // end detail + +/*! This function assigns the contents of vector a to vector b and the + * contents of vector b to vector a. + * + * \param a The first vector of interest. After completion, the contents + * of b will be returned here. + * \param b The second vector of interest. After completion, the contents + * of a will be returned here. + */ +template + void swap(detail::vector_base &a, + detail::vector_base &b); + + +/*! This operator allows comparison between two vectors. + * \param lhs The first \p vector to compare. + * \param rhs The second \p vector to compare. + * \return \c true if and only if each corresponding element in either + * \p vector equals the other; \c false, otherwise. + */ +template +bool operator==(const detail::vector_base& lhs, + const detail::vector_base& rhs); + +template +bool operator==(const detail::vector_base& lhs, + const std::vector& rhs); + +template +bool operator==(const std::vector& lhs, + const detail::vector_base& rhs); + +/*! This operator allows comparison between two vectors. + * \param lhs The first \p vector to compare. + * \param rhs The second \p vector to compare. + * \return \c false if and only if each corresponding element in either + * \p vector equals the other; \c true, otherwise. + */ +template +bool operator!=(const detail::vector_base& lhs, + const detail::vector_base& rhs); + +template +bool operator!=(const detail::vector_base& lhs, + const std::vector& rhs); + +template +bool operator!=(const std::vector& lhs, + const detail::vector_base& rhs); + +} // end thrust + +#include + diff --git a/compat/thrust/detail/vector_base.inl b/compat/thrust/detail/vector_base.inl new file mode 100644 index 0000000..24e6466 --- /dev/null +++ b/compat/thrust/detail/vector_base.inl @@ -0,0 +1,1203 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file vector_base.inl + * \brief Inline file for vector_base.h. 
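[Editor's note] The private init_dispatch / insert_dispatch / assign_dispatch helpers declared above all resolve the same ambiguity: a (first, last) template overload must not capture calls such as vector(10, 42), so the argument type is classified with is_integral and routed at compile time. A standalone sketch of the idiom (illustrative names, not Thrust's):

struct true_type  {};
struct false_type {};

template<typename T> struct is_integral       { typedef false_type type; };
template<>           struct is_integral<int>  { typedef true_type  type; };
template<>           struct is_integral<long> { typedef true_type  type; };
// ...remaining integral types omitted for brevity

template<typename T>
struct tiny_vector
{
  template<typename IteratorOrIntegral>
  tiny_vector(IteratorOrIntegral first, IteratorOrIntegral last)
  {
    // pick the overload at compile time based on the argument's category
    init_dispatch(first, last, typename is_integral<IteratorOrIntegral>::type());
  }

private:
  template<typename Integral>
  void init_dispatch(Integral n, Integral value, true_type)
  { /* fill-construct n copies of value */ }

  template<typename Iterator>
  void init_dispatch(Iterator first, Iterator last, false_type)
  { /* copy-construct from [first, last) */ }
};

// tiny_vector<int> a(10, 42);            // integral path: ten copies of 42
// int data[3] = { 1, 2, 3 };
// tiny_vector<int> b(data, data + 3);    // iterator path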
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace thrust +{ + +namespace detail +{ + +template + vector_base + ::vector_base(void) + :m_storage(), + m_size(0) +{ + ; +} // end vector_base::vector_base() + +template + vector_base + ::vector_base(size_type n) + :m_storage(), + m_size(0) +{ + default_init(n); +} // end vector_base::vector_base() + +template + vector_base + ::vector_base(size_type n, const value_type &value) + :m_storage(), + m_size(0) +{ + fill_init(n,value); +} // end vector_base::vector_base() + +template + vector_base + ::vector_base(const vector_base &v) + :m_storage(), + m_size(0) +{ + range_init(v.begin(), v.end()); +} // end vector_base::vector_base() + +template + vector_base & + vector_base + ::operator=(const vector_base &v) +{ + if(this != &v) + { + assign(v.begin(), v.end()); + } // end if + + return *this; +} // end vector_base::operator=() + +template + template + vector_base + ::vector_base(const vector_base &v) + :m_storage(), + m_size(0) +{ + range_init(v.begin(), v.end()); +} // end vector_base::vector_base() + +template + template + vector_base & + vector_base + ::operator=(const vector_base &v) +{ + assign(v.begin(), v.end()); + + return *this; +} // end vector_base::operator=() + +template + template + vector_base + ::vector_base(const std::vector &v) + :m_storage(), + m_size(0) +{ + range_init(v.begin(), v.end()); +} // end vector_base::vector_base() + +template + template + vector_base & + vector_base + ::operator=(const std::vector &v) +{ + assign(v.begin(), v.end()); + + return *this; +} // end vector_base::operator=() + +template + template + void vector_base + ::init_dispatch(IteratorOrIntegralType n, + IteratorOrIntegralType value, + true_type) +{ + fill_init(n,value); +} // end vector_base::init_dispatch() + +template + void vector_base + ::default_init(size_type n) +{ + if(n > 0) + { + m_storage.allocate(n); + m_size = n; + + m_storage.default_construct_n(begin(), size()); + } // end if +} // end vector_base::default_init() + +template + void vector_base + ::fill_init(size_type n, const T &x) +{ + if(n > 0) + { + m_storage.allocate(n); + m_size = n; + + m_storage.uninitialized_fill_n(begin(), size(), x); + } // end if +} // end vector_base::fill_init() + +template + template + void vector_base + ::init_dispatch(InputIterator first, + InputIterator last, + false_type) +{ + range_init(first, last); +} // end vector_base::init_dispatch() + +template + template + void vector_base + ::range_init(InputIterator first, + InputIterator last) +{ + range_init(first, last, + typename thrust::iterator_traversal::type()); +} // end vector_base::range_init() + +template + template + void vector_base + ::range_init(InputIterator first, + InputIterator last, + thrust::incrementable_traversal_tag) +{ + for(; first != last; ++first) + push_back(*first); +} // end vector_base::range_init() + +template + template + void vector_base + ::range_init(ForwardIterator first, + ForwardIterator last, + thrust::random_access_traversal_tag) +{ + size_type new_size = thrust::distance(first, last); + + allocate_and_copy(new_size, first, last, m_storage); + m_size = new_size; +} // end vector_base::range_init() + +template + template + vector_base + ::vector_base(InputIterator first, + InputIterator last) + :m_storage(), + m_size(0) +{ + // check the type of InputIterator: if it's an integral type, + // we need to interpret this call as (size_type, value_type) + typedef thrust::detail::is_integral 
Integer; + + init_dispatch(first, last, Integer()); +} // end vector_basee::vector_base() + +template + void vector_base + ::resize(size_type new_size) +{ + if(new_size < size()) + { + iterator new_end = begin(); + thrust::advance(new_end, new_size); + erase(new_end, end()); + } // end if + else + { + append(new_size - size()); + } // end else +} // end vector_base::resize() + +template + void vector_base + ::resize(size_type new_size, const value_type &x) +{ + if(new_size < size()) + { + iterator new_end = begin(); + thrust::advance(new_end, new_size); + erase(new_end, end()); + } // end if + else + { + insert(end(), new_size - size(), x); + } // end else +} // end vector_base::resize() + +template + typename vector_base::size_type + vector_base + ::size(void) const +{ + return m_size; +} // end vector_base::size() + +template + typename vector_base::size_type + vector_base + ::max_size(void) const +{ + return m_storage.max_size(); +} // end vector_base::max_size() + +template + void vector_base + ::reserve(size_type n) +{ + if(n > capacity()) + { + allocate_and_copy(n, begin(), end(), m_storage); + } // end if +} // end vector_base::reserve() + +template + typename vector_base::size_type + vector_base + ::capacity(void) const +{ + return m_storage.size(); +} // end vector_base::capacity() + +template + void vector_base + ::shrink_to_fit(void) +{ + // use the swap trick + vector_base(*this).swap(*this); +} // end vector_base::shrink_to_fit() + +template + typename vector_base::reference + vector_base + ::operator[](const size_type n) +{ + return m_storage[n]; +} // end vector_base::operator[] + +template + typename vector_base::const_reference + vector_base + ::operator[](const size_type n) const +{ + return m_storage[n]; +} // end vector_base::operator[] + +template + typename vector_base::iterator + vector_base + ::begin(void) +{ + return m_storage.begin(); +} // end vector_base::begin() + +template + typename vector_base::const_iterator + vector_base + ::begin(void) const +{ + return m_storage.begin(); +} // end vector_base::begin() + +template + typename vector_base::const_iterator + vector_base + ::cbegin(void) const +{ + return begin(); +} // end vector_base::cbegin() + +template + typename vector_base::reverse_iterator + vector_base + ::rbegin(void) +{ + return reverse_iterator(end()); +} // end vector_base::rbegin() + +template + typename vector_base::const_reverse_iterator + vector_base + ::rbegin(void) const +{ + return const_reverse_iterator(end()); +} // end vector_base::rbegin() + +template + typename vector_base::const_reverse_iterator + vector_base + ::crbegin(void) const +{ + return rbegin(); +} // end vector_base::crbegin() + +template + typename vector_base::iterator + vector_base + ::end(void) +{ + iterator result = begin(); + thrust::advance(result, size()); + return result; +} // end vector_base::end() + +template + typename vector_base::const_iterator + vector_base + ::end(void) const +{ + const_iterator result = begin(); + thrust::advance(result, size()); + return result; +} // end vector_base::end() + +template + typename vector_base::const_iterator + vector_base + ::cend(void) const +{ + return end(); +} // end vector_base::cend() + +template + typename vector_base::reverse_iterator + vector_base + ::rend(void) +{ + return reverse_iterator(begin()); +} // end vector_base::rend() + +template + typename vector_base::const_reverse_iterator + vector_base + ::rend(void) const +{ + return const_reverse_iterator(begin()); +} // end vector_base::rend() + +template + 
typename vector_base::const_reverse_iterator + vector_base + ::crend(void) const +{ + return rend(); +} // end vector_base::crend() + +template + typename vector_base::const_reference + vector_base + ::front(void) const +{ + return *begin(); +} // end vector_base::front() + +template + typename vector_base::reference + vector_base + ::front(void) +{ + return *begin(); +} // end vector_base::front() + +template + typename vector_base::const_reference + vector_base + ::back(void) const +{ + const_iterator ptr_to_back = end(); + --ptr_to_back; + return *ptr_to_back; +} // end vector_base::vector_base + +template + typename vector_base::reference + vector_base + ::back(void) +{ + iterator ptr_to_back = end(); + --ptr_to_back; + return *ptr_to_back; +} // end vector_base::vector_base + +template + typename vector_base::pointer + vector_base + ::data(void) +{ + return &front(); +} // end vector_base::data() + +template + typename vector_base::const_pointer + vector_base + ::data(void) const +{ + return &front(); +} // end vector_base::data() + +template + vector_base + ::~vector_base(void) +{ + // destroy every living thing + m_storage.destroy(begin(),end()); +} // end vector_base::~vector_base() + +template + void vector_base + ::clear(void) +{ + resize(0); +} // end vector_base::~vector_dev() + +template + bool vector_base + ::empty(void) const +{ + return size() == 0; +} // end vector_base::empty(); + +template + void vector_base + ::push_back(const value_type &x) +{ + insert(end(), x); +} // end vector_base::push_back() + +template + void vector_base + ::pop_back(void) +{ + iterator e = end(); + iterator ptr_to_back = e; + --ptr_to_back; + m_storage.destroy(ptr_to_back, e); + --m_size; +} // end vector_base::pop_back() + +template + typename vector_base::iterator vector_base + ::erase(iterator pos) +{ + iterator end = pos; + ++end; + return erase(pos,end); +} // end vector_base::erase() + +template + typename vector_base::iterator vector_base + ::erase(iterator first, iterator last) +{ + // overlap copy the range [last,end()) to first + // XXX this copy only potentially overlaps + iterator i = thrust::detail::overlapped_copy(last, end(), first); + + // destroy everything after i + m_storage.destroy(i, end()); + + // modify our size + m_size -= (last - first); + + // return an iterator pointing to the position of the first element + // following the erased range + return first; +} // end vector_base::erase() + +template + void vector_base + ::swap(vector_base &v) +{ + thrust::swap(m_storage, v.m_storage); + thrust::swap(m_size, v.m_size); +} // end vector_base::swap() + +template + void vector_base + ::assign(size_type n, const T &x) +{ + fill_assign(n, x); +} // end vector_base::assign() + +template + template + void vector_base + ::assign(InputIterator first, InputIterator last) +{ + // we could have received assign(n, x), so disambiguate on the + // type of InputIterator + typedef typename thrust::detail::is_integral integral; + + assign_dispatch(first, last, integral()); +} // end vector_base::assign() + +template + typename vector_base::allocator_type + vector_base + ::get_allocator(void) const +{ + return m_storage.get_allocator(); +} // end vector_base::get_allocator() + +template + typename vector_base::iterator + vector_base + ::insert(iterator position, const T &x) +{ + // find the index of the insertion + size_type index = thrust::distance(begin(), position); + + // make the insertion + insert(position, 1, x); + + // return an iterator pointing back to position + iterator result = 
begin(); + thrust::advance(result, index); + return result; +} // end vector_base::insert() + +template + void vector_base + ::insert(iterator position, size_type n, const T &x) +{ + fill_insert(position, n, x); +} // end vector_base::insert() + +template + template + void vector_base + ::insert(iterator position, InputIterator first, InputIterator last) +{ + // we could have received insert(position, n, x), so disambiguate on the + // type of InputIterator + typedef typename thrust::detail::is_integral integral; + + insert_dispatch(position, first, last, integral()); +} // end vector_base::insert() + +template + template + void vector_base + ::assign_dispatch(InputIterator first, InputIterator last, false_type) +{ + range_assign(first, last); +} // end vector_base::assign_dispatch() + +template + template + void vector_base + ::assign_dispatch(Integral n, Integral x, true_type) +{ + fill_assign(n, x); +} // end vector_base::assign_dispatch() + +template + template + void vector_base + ::insert_dispatch(iterator position, InputIterator first, InputIterator last, false_type) +{ + copy_insert(position, first, last); +} // end vector_base::insert_dispatch() + +template + template + void vector_base + ::insert_dispatch(iterator position, Integral n, Integral x, true_type) +{ + fill_insert(position, n, x); +} // end vector_base::insert_dispatch() + +template + template + void vector_base + ::copy_insert(iterator position, + ForwardIterator first, + ForwardIterator last) +{ + if(first != last) + { + // how many new elements will we create? + const size_type num_new_elements = thrust::distance(first, last); + if(capacity() - size() >= num_new_elements) + { + // we've got room for all of them + // how many existing elements will we displace? + const size_type num_displaced_elements = end() - position; + iterator old_end = end(); + + if(num_displaced_elements > num_new_elements) + { + // construct copy n displaced elements to new elements + // following the insertion + m_storage.uninitialized_copy(end() - num_new_elements, end(), end()); + + // extend the size + m_size += num_new_elements; + + // copy num_displaced_elements - num_new_elements elements to existing elements + // this copy overlaps + const size_type copy_length = (old_end - num_new_elements) - position; + thrust::detail::overlapped_copy(position, old_end - num_new_elements, old_end - copy_length); + + // finally, copy the range to the insertion point + thrust::copy(first, last, position); + } // end if + else + { + ForwardIterator mid = first; + thrust::advance(mid, num_displaced_elements); + + // construct copy new elements at the end of the vector + m_storage.uninitialized_copy(mid, last, end()); + + // extend the size + m_size += num_new_elements - num_displaced_elements; + + // construct copy the displaced elements + m_storage.uninitialized_copy(position, old_end, end()); + + // extend the size + m_size += num_displaced_elements; + + // copy to elements which already existed + thrust::copy(first, mid, position); + } // end else + } // end if + else + { + const size_type old_size = size(); + + // compute the new capacity after the allocation + size_type new_capacity = old_size + thrust::max THRUST_PREVENT_MACRO_SUBSTITUTION (old_size, num_new_elements); + + // allocate exponentially larger new storage + new_capacity = thrust::max THRUST_PREVENT_MACRO_SUBSTITUTION (new_capacity, 2 * capacity()); + + // do not exceed maximum storage + new_capacity = thrust::min THRUST_PREVENT_MACRO_SUBSTITUTION (new_capacity, max_size()); + + 
if(new_capacity > max_size()) + { + throw std::length_error("insert(): insertion exceeds max_size()."); + } // end if + + storage_type new_storage(new_capacity); + + // record how many constructors we invoke in the try block below + iterator new_end = new_storage.begin(); + + try + { + // construct copy elements before the insertion to the beginning of the newly + // allocated storage + new_end = m_storage.uninitialized_copy(begin(), position, new_storage.begin()); + + // construct copy elements to insert + new_end = m_storage.uninitialized_copy(first, last, new_end); + + // construct copy displaced elements from the old storage to the new storage + // remember [position, end()) refers to the old storage + new_end = m_storage.uninitialized_copy(position, end(), new_end); + } // end try + catch(...) + { + // something went wrong, so destroy & deallocate the new storage + m_storage.destroy(new_storage.begin(), new_end); + new_storage.deallocate(); + + // rethrow + throw; + } // end catch + + // call destructors on the elements in the old storage + m_storage.destroy(begin(), end()); + + // record the vector's new state + m_storage.swap(new_storage); + m_size = old_size + num_new_elements; + } // end else + } // end if +} // end vector_base::copy_insert() + +template + void vector_base + ::append(size_type n) +{ + if(n != 0) + { + if(capacity() - size() >= n) + { + // we've got room for all of them + + // default construct new elements at the end of the vector + m_storage.default_construct_n(end(), n); + + // extend the size + m_size += n; + } // end if + else + { + const size_type old_size = size(); + + // compute the new capacity after the allocation + size_type new_capacity = old_size + thrust::max THRUST_PREVENT_MACRO_SUBSTITUTION (old_size, n); + + // allocate exponentially larger new storage + new_capacity = thrust::max THRUST_PREVENT_MACRO_SUBSTITUTION (new_capacity, 2 * capacity()); + + // do not exceed maximum storage + new_capacity = thrust::min THRUST_PREVENT_MACRO_SUBSTITUTION (new_capacity, max_size()); + + // create new storage + storage_type new_storage(new_capacity); + + // record how many constructors we invoke in the try block below + iterator new_end = new_storage.begin(); + + try + { + // construct copy all elements into the newly allocated storage + new_end = m_storage.uninitialized_copy(begin(), end(), new_storage.begin()); + + // construct new elements to insert + m_storage.default_construct_n(new_end, n); + new_end += n; + } // end try + catch(...) + { + // something went wrong, so destroy & deallocate the new storage + m_storage.destroy(new_storage.begin(), new_end); + new_storage.deallocate(); + + // rethrow + throw; + } // end catch + + // call destructors on the elements in the old storage + m_storage.destroy(begin(), end()); + + // record the vector's new state + m_storage.swap(new_storage); + m_size = old_size + n; + } // end else + } // end if +} // end vector_base::append() + +template + void vector_base + ::fill_insert(iterator position, size_type n, const T &x) +{ + if(n != 0) + { + if(capacity() - size() >= n) + { + // we've got room for all of them + // how many existing elements will we displace? 
+ const size_type num_displaced_elements = end() - position; + iterator old_end = end(); + + if(num_displaced_elements > n) + { + // construct copy n displaced elements to new elements + // following the insertion + m_storage.uninitialized_copy(end() - n, end(), end()); + + // extend the size + m_size += n; + + // copy num_displaced_elements - n elements to existing elements + // this copy overlaps + const size_type copy_length = (old_end - n) - position; + thrust::detail::overlapped_copy(position, old_end - n, old_end - copy_length); + + // finally, fill the range to the insertion point + thrust::fill_n(position, n, x); + } // end if + else + { + // construct new elements at the end of the vector + m_storage.uninitialized_fill_n(end(), n - num_displaced_elements, x); + + // extend the size + m_size += n - num_displaced_elements; + + // construct copy the displaced elements + m_storage.uninitialized_copy(position, old_end, end()); + + // extend the size + m_size += num_displaced_elements; + + // fill to elements which already existed + thrust::fill(position, old_end, x); + } // end else + } // end if + else + { + const size_type old_size = size(); + + // compute the new capacity after the allocation + size_type new_capacity = old_size + thrust::max THRUST_PREVENT_MACRO_SUBSTITUTION (old_size, n); + + // allocate exponentially larger new storage + new_capacity = thrust::max THRUST_PREVENT_MACRO_SUBSTITUTION (new_capacity, 2 * capacity()); + + // do not exceed maximum storage + new_capacity = thrust::min THRUST_PREVENT_MACRO_SUBSTITUTION (new_capacity, max_size()); + + if(new_capacity > max_size()) + { + throw std::length_error("insert(): insertion exceeds max_size()."); + } // end if + + storage_type new_storage(new_capacity); + + // record how many constructors we invoke in the try block below + iterator new_end = new_storage.begin(); + + try + { + // construct copy elements before the insertion to the beginning of the newly + // allocated storage + new_end = m_storage.uninitialized_copy(begin(), position, new_storage.begin()); + + // construct new elements to insert + m_storage.uninitialized_fill_n(new_end, n, x); + new_end += n; + + // construct copy displaced elements from the old storage to the new storage + // remember [position, end()) refers to the old storage + new_end = m_storage.uninitialized_copy(position, end(), new_end); + } // end try + catch(...) 
+ { + // something went wrong, so destroy & deallocate the new storage + m_storage.destroy(new_storage.begin(), new_end); + new_storage.deallocate(); + + // rethrow + throw; + } // end catch + + // call destructors on the elements in the old storage + m_storage.destroy(begin(), end()); + + // record the vector's new state + m_storage.swap(new_storage); + m_size = old_size + n; + } // end else + } // end if +} // end vector_base::fill_insert() + +template + template + void vector_base + ::range_assign(InputIterator first, + InputIterator last) +{ + // dispatch on traversal + range_assign(first, last, + typename thrust::iterator_traversal::type()); +} // end range_assign() + +template + template + void vector_base + ::range_assign(InputIterator first, + InputIterator last, + thrust::incrementable_traversal_tag) +{ + iterator current(begin()); + + // assign to elements which already exist + for(; first != last && current != end(); ++current, ++first) + { + *current = *first; + } // end for + + // either just the input was exhausted or both + // the input and vector elements were exhausted + if(first == last) + { + // if we exhausted the input, erase leftover elements + erase(current, end()); + } // end if + else + { + // insert the rest of the input at the end of the vector + insert(end(), first, last); + } // end else +} // end vector_base::range_assign() + +template + template + void vector_base + ::range_assign(RandomAccessIterator first, + RandomAccessIterator last, + thrust::random_access_traversal_tag) +{ + const size_type n = thrust::distance(first, last); + + if(n > capacity()) + { + storage_type new_storage; + allocate_and_copy(n, first, last, new_storage); + + // call destructors on the elements in the old storage + m_storage.destroy(begin(), end()); + + // record the vector's new state + m_storage.swap(new_storage); + m_size = n; + } // end if + else if(size() >= n) + { + // we can already accomodate the new range + iterator new_end = thrust::copy(first, last, begin()); + + // destroy the elements we don't need + m_storage.destroy(new_end, end()); + + // update size + m_size = n; + } // end else if + else + { + // range fits inside allocated storage, but some elements + // have not been constructed yet + + // XXX TODO we could possibly implement this with one call + // to transform rather than copy + uninitialized_copy + + // copy to elements which already exist + RandomAccessIterator mid = first; + thrust::advance(mid, size()); + thrust::copy(first, mid, begin()); + + // uninitialize_copy to elements which must be constructed + m_storage.uninitialized_copy(mid, last, end()); + + // update size + m_size = n; + } // end else +} // end vector_base::assign() + +template + void vector_base + ::fill_assign(size_type n, const T &x) +{ + if(n > capacity()) + { + // XXX we should also include a copy of the allocator: + // vector_base temp(n, x, get_allocator()); + vector_base temp(n, x); + temp.swap(*this); + } // end if + else if(n > size()) + { + // fill to existing elements + thrust::fill(begin(), end(), x); + + // construct uninitialized elements + m_storage.uninitialized_fill_n(end(), n - size(), x); + + // adjust size + m_size += (n - size()); + } // end else if + else + { + // fill to existing elements + iterator new_end = thrust::fill_n(begin(), n, x); + + // erase the elements after the fill + erase(new_end, end()); + } // end else +} // end vector_base::fill_assign() + +template + template + void vector_base + ::allocate_and_copy(size_type requested_size, + ForwardIterator first, 
ForwardIterator last, + storage_type &new_storage) +{ + if(requested_size == 0) + { + new_storage.deallocate(); + return; + } // end if + + // allocate exponentially larger new storage + size_type allocated_size = thrust::max(requested_size, 2 * capacity()); + + // do not exceed maximum storage + allocated_size = thrust::min(allocated_size, max_size()); + + if(requested_size > allocated_size) + { + throw std::length_error("assignment exceeds max_size()."); + } // end if + + new_storage.allocate(allocated_size); + + try + { + // construct the range to the newly allocated storage + m_storage.uninitialized_copy(first, last, new_storage.begin()); + } // end try + catch(...) + { + // something went wrong, so destroy & deallocate the new storage + // XXX seems like this destroys too many elements -- should just be last - first instead of requested_size + iterator new_storage_end = new_storage.begin(); + thrust::advance(new_storage_end, requested_size); + m_storage.destroy(new_storage.begin(), new_storage_end); + new_storage.deallocate(); + + // rethrow + throw; + } // end catch +} // end vector_base::allocate_and_copy() + + +} // end detail + +template + void swap(detail::vector_base &a, + detail::vector_base &b) +{ + a.swap(b); +} // end swap() + + + +namespace detail +{ + +// iterator tags match +template +bool vector_equal(InputIterator1 first1, InputIterator1 last1, + InputIterator2 first2, + thrust::detail::true_type) +{ + return thrust::equal(first1, last1, first2); +} + +// iterator tags differ +template +bool vector_equal(InputIterator1 first1, InputIterator1 last1, + InputIterator2 first2, + thrust::detail::false_type) +{ + typename thrust::iterator_difference::type n = thrust::distance(first1,last1); + + typedef typename thrust::iterator_system::type FromSystem1; + typedef typename thrust::iterator_system::type FromSystem2; + + // bring both ranges to the host system + // note that these copies are no-ops if the range is already convertible to the host system + FromSystem1 from_system1; + FromSystem2 from_system2; + thrust::host_system_tag to_system; + thrust::detail::move_to_system rng1(from_system1, to_system, first1, last1); + thrust::detail::move_to_system rng2(from_system2, to_system, first2, first2 + n); + + return thrust::equal(rng1.begin(), rng1.end(), rng2.begin()); +} + +template +bool vector_equal(InputIterator1 first1, InputIterator1 last1, + InputIterator2 first2) +{ + typedef typename thrust::iterator_system::type system1; + typedef typename thrust::iterator_system::type system2; + + // dispatch on the sameness of the two systems + return vector_equal(first1, last1, first2, + thrust::detail::is_same()); +} + +} // end namespace detail + + + + +template +bool operator==(const detail::vector_base& lhs, + const detail::vector_base& rhs) +{ + return lhs.size() == rhs.size() && detail::vector_equal(lhs.begin(), lhs.end(), rhs.begin()); +} + +template +bool operator==(const detail::vector_base& lhs, + const std::vector& rhs) +{ + return lhs.size() == rhs.size() && detail::vector_equal(lhs.begin(), lhs.end(), rhs.begin()); +} + +template +bool operator==(const std::vector& lhs, + const detail::vector_base& rhs) +{ + return lhs.size() == rhs.size() && detail::vector_equal(lhs.begin(), lhs.end(), rhs.begin()); +} + +template +bool operator!=(const detail::vector_base& lhs, + const detail::vector_base& rhs) +{ + return !(lhs == rhs); +} + +template +bool operator!=(const detail::vector_base& lhs, + const std::vector& rhs) +{ + return !(lhs == rhs); +} + +template +bool 
operator!=(const std::vector& lhs, + const detail::vector_base& rhs) +{ + return !(lhs == rhs); +} + +} // end thrust + diff --git a/compat/thrust/device_allocator.h b/compat/thrust/device_allocator.h new file mode 100644 index 0000000..a5462d1 --- /dev/null +++ b/compat/thrust/device_allocator.h @@ -0,0 +1,123 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file device_allocator.h + * \brief An allocator which creates new elements in device memory + */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ + +/*! \addtogroup memory_management_classes Memory Management Classes + * \{ + */ + +template class device_allocator; + +/*! \p device_allocator is a device memory allocator. + * This class is a specialization for \c void. + * + * \see device_ptr + * \see http://www.sgi.com/tech/stl/Allocators.html + */ +template<> + class device_allocator +{ + public: + /*! Type of element allocated, \c void. */ + typedef void value_type; + + /*! Pointer to allocation, \c device_ptr. */ + typedef device_ptr pointer; + + /*! \c const pointer to allocation, \c device_ptr. */ + typedef device_ptr const_pointer; + + /*! Type of allocation size, \c std::size_t. */ + typedef std::size_t size_type; + + /*! Type of allocation difference, \c pointer::difference_type. */ + typedef pointer::difference_type difference_type; + + /*! The \p rebind metafunction provides the type of a \p device_allocator + * instantiated with another type. + * + * \tparam U The other type to use for instantiation. + */ + template + struct rebind + { + /*! The typedef \p other gives the type of the rebound \p device_allocator. + */ + typedef device_allocator other; + }; // end rebind +}; // end device_allocator + +/*! \p device_allocator is a device memory allocator. + * This implementation inherits from \p device_new_allocator. + * + * \see device_ptr + * \see device_new_allocator + * \see http://www.sgi.com/tech/stl/Allocators.html + */ +template + class device_allocator + : public device_new_allocator +{ + public: + /*! The \p rebind metafunction provides the type of a \p device_allocator + * instantiated with another type. + * + * \tparam U The other type to use for instantiation. + */ + template + struct rebind + { + /*! The typedef \p other gives the type of the rebound \p device_allocator. + */ + typedef device_allocator other; + }; // end rebind + + /*! No-argument constructor has no effect. + */ + __host__ __device__ + inline device_allocator() {} + + /*! Copy constructor has no effect. + */ + __host__ __device__ + inline device_allocator(device_allocator const&) {} + + /*! Constructor from other \p allocator has no effect. + */ + template + __host__ __device__ + inline device_allocator(device_allocator const&) {} +}; // end device_allocator + +/*! 
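As a quick, hedged illustration of the allocator interface defined just above (this sketch is not part of the original patch; the template arguments and include path are assumed, since the angle-bracket contents are stripped in this rendering), a device_allocator can hand out and reclaim typed device storage directly:

#include <thrust/device_allocator.h>

int main(void)
{
    // a device_allocator<int> obtains device storage via its device_new_allocator base
    thrust::device_allocator<int> alloc;

    // request uninitialized storage for 10 ints; the result is a device_ptr<int>
    thrust::device_ptr<int> p = alloc.allocate(10);

    // device_ptr has pointer semantics, so elements are reachable from the host
    p[0] = 13;

    // return the storage; the size must match the original allocation
    alloc.deallocate(p, 10);

    return 0;
}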
\} + */ + +} // end thrust + diff --git a/compat/thrust/device_delete.h b/compat/thrust/device_delete.h new file mode 100644 index 0000000..1df3bb6 --- /dev/null +++ b/compat/thrust/device_delete.h @@ -0,0 +1,56 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file device_delete.h + * \brief Deletes variables in device memory + */ + +#pragma once + +#include +#include + +namespace thrust +{ + +/*! \addtogroup deallocation_functions Deallocation Functions + * \ingroup memory_management_functions + * \{ + */ + +/*! \p device_delete deletes a \p device_ptr allocated with + * \p device_new. + * + * \param ptr The \p device_ptr to delete, assumed to have + * been allocated with \p device_new. + * \param n The number of objects to destroy at \p ptr. Defaults to \c 1 + * similar to \p device_new. + * + * \see device_ptr + * \see device_new + */ +template + inline void device_delete(thrust::device_ptr ptr, + const size_t n = 1); + +/*! \} + */ + +} // end thrust + +#include + diff --git a/compat/thrust/device_free.h b/compat/thrust/device_free.h new file mode 100644 index 0000000..a734418 --- /dev/null +++ b/compat/thrust/device_free.h @@ -0,0 +1,68 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file device_free.h + * \brief Deallocates storage allocated by \p device_malloc + */ + +#pragma once + +#include +#include + +namespace thrust +{ + +/*! \addtogroup deallocation_functions Deallocation Functions + * \ingroup memory_management_functions + * \{ + */ + +/*! \p device_free deallocates memory allocated by the function \p device_malloc. + * + * \param ptr A \p device_ptr pointing to memory to be deallocated. + * + * The following code snippet demonstrates how to use \p device_free to + * deallocate memory allocated by \p device_malloc. + * + * \code + * #include + * #include + * ... + * // allocate some integers with device_malloc + * const int N = 100; + * thrust::device_ptr int_array = thrust::device_malloc(N); + * + * // manipulate integers + * ... + * + * // deallocate with device_free + * thrust::device_free(int_array); + * \endcode + * + * \see device_ptr + * \see device_malloc + */ +inline void device_free(thrust::device_ptr ptr); + +/*! 
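The usage snippet embedded in the device_free documentation above loses its template arguments in this rendering of the patch. A self-contained version, assuming the usual device_malloc<T>/device_ptr<T> spellings and include paths, would look roughly like this:

#include <thrust/device_malloc.h>
#include <thrust/device_free.h>

int main(void)
{
    // allocate storage for 100 ints with device_malloc
    const int N = 100;
    thrust::device_ptr<int> int_array = thrust::device_malloc<int>(N);

    // manipulate the integers through the device_ptr
    int_array[0] = 7;

    // deallocate with device_free
    thrust::device_free(int_array);

    return 0;
}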
\} + */ + +} // end thrust + +#include + diff --git a/compat/thrust/device_malloc.h b/compat/thrust/device_malloc.h new file mode 100644 index 0000000..a3b0723 --- /dev/null +++ b/compat/thrust/device_malloc.h @@ -0,0 +1,103 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file device_malloc.h + * \brief Allocates storage in device memory + */ + +#pragma once + +#include +#include +#include // for std::size_t + +namespace thrust +{ + +/*! \addtogroup allocation_functions Allocation Functions + * \ingroup memory_management_functions + * \{ + */ + +/*! This version of \p device_malloc allocates sequential device storage + * for bytes. + * + * \param n The number of bytes to allocate sequentially + * in device memory. + * \return A \p device_ptr to the newly allocated memory. + * + * The following code snippet demonstrates how to use \p device_malloc to + * allocate a range of device memory. + * + * \code + * #include + * #include + * ... + * // allocate some memory with device_malloc + * const int N = 100; + * thrust::device_ptr void_ptr = thrust::device_malloc(N); + * + * // manipulate memory + * ... + * + * // deallocate with device_free + * thrust::device_free(void_ptr); + * \endcode + * + * \see device_ptr + * \see device_free + */ +inline thrust::device_ptr device_malloc(const std::size_t n); + +/*! This version of \p device_malloc allocates sequential device storage for + * new objects of the given type. + * + * \param n The number of objects of type T to allocate + * sequentially in device memory. + * \return A \p device_ptr to the newly allocated memory. + * + * The following code snippet demonstrates how to use \p device_malloc to + * allocate a range of device memory. + * + * \code + * #include + * #include + * ... + * // allocate some integers with device_malloc + * const int N = 100; + * thrust::device_ptr int_array = thrust::device_malloc(N); + * + * // manipulate integers + * ... + * + * // deallocate with device_free + * thrust::device_free(int_array); + * \endcode + * + * \see device_ptr + * \see device_free + */ +template + inline thrust::device_ptr device_malloc(const std::size_t n); + +/*! \} + */ + +} // end thrust + +#include + diff --git a/compat/thrust/device_malloc_allocator.h b/compat/thrust/device_malloc_allocator.h new file mode 100644 index 0000000..404a6d2 --- /dev/null +++ b/compat/thrust/device_malloc_allocator.h @@ -0,0 +1,174 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file device_malloc_allocator.h + * \brief An allocator which allocates storage with \p device_malloc + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ + +// forward declarations to WAR circular #includes +template class device_ptr; +template device_ptr device_malloc(const std::size_t n); + +/*! \addtogroup memory_management Memory Management + * \addtogroup memory_management_classes Memory Management Classes + * \ingroup memory_management + * \{ + */ + +/*! \p device_malloc_allocator is a device memory allocator that employs the + * \p device_malloc function for allocation. + * + * \see device_malloc + * \see device_ptr + * \see http://www.sgi.com/tech/stl/Allocators.html + */ +template + class device_malloc_allocator +{ + public: + /*! Type of element allocated, \c T. */ + typedef T value_type; + + /*! Pointer to allocation, \c device_ptr. */ + typedef device_ptr pointer; + + /*! \c const pointer to allocation, \c device_ptr. */ + typedef device_ptr const_pointer; + + /*! Reference to allocated element, \c device_reference. */ + typedef device_reference reference; + + /*! \c const reference to allocated element, \c device_reference. */ + typedef device_reference const_reference; + + /*! Type of allocation size, \c std::size_t. */ + typedef std::size_t size_type; + + /*! Type of allocation difference, \c pointer::difference_type. */ + typedef typename pointer::difference_type difference_type; + + /*! The \p rebind metafunction provides the type of a \p device_malloc_allocator + * instantiated with another type. + * + * \tparam U The other type to use for instantiation. + */ + template + struct rebind + { + /*! The typedef \p other gives the type of the rebound \p device_malloc_allocator. + */ + typedef device_malloc_allocator other; + }; // end rebind + + /*! No-argument constructor has no effect. */ + __host__ __device__ + inline device_malloc_allocator() {} + + /*! No-argument destructor has no effect. */ + __host__ __device__ + inline ~device_malloc_allocator() {} + + /*! Copy constructor has no effect. */ + __host__ __device__ + inline device_malloc_allocator(device_malloc_allocator const&) {} + + /*! Constructor from other \p device_malloc_allocator has no effect. */ + template + __host__ __device__ + inline device_malloc_allocator(device_malloc_allocator const&) {} + + /*! Returns the address of an allocated object. + * \return &r. + */ + __host__ __device__ + inline pointer address(reference r) { return &r; } + + /*! Returns the address an allocated object. + * \return &r. + */ + __host__ __device__ + inline const_pointer address(const_reference r) { return &r; } + + /*! Allocates storage for \p cnt objects. + * \param cnt The number of objects to allocate. + * \return A \p pointer to uninitialized storage for \p cnt objects. + * \note Memory allocated by this function must be deallocated with \p deallocate. + */ + __host__ + inline pointer allocate(size_type cnt, + const_pointer = const_pointer(static_cast(0))) + { + if(cnt > this->max_size()) + { + throw std::bad_alloc(); + } // end if + + return pointer(device_malloc(cnt)); + } // end allocate() + + /*! Deallocates storage for objects allocated with \p allocate. + * \param p A \p pointer to the storage to deallocate. + * \param cnt The size of the previous allocation. 
+ * \note Memory deallocated by this function must previously have been + * allocated with \p allocate. + */ + __host__ + inline void deallocate(pointer p, size_type cnt) + { + device_free(p); + } // end deallocate() + + /*! Returns the largest value \c n for which allocate(n) might succeed. + * \return The largest value \c n for which allocate(n) might succeed. + */ + inline size_type max_size() const + { + return (std::numeric_limits::max)() / sizeof(T); + } // end max_size() + + /*! Compares against another \p device_malloc_allocator for equality. + * \return \c true + */ + __host__ __device__ + inline bool operator==(device_malloc_allocator const&) { return true; } + + /*! Compares against another \p device_malloc_allocator for inequality. + * \return \c false + */ + __host__ __device__ + inline bool operator!=(device_malloc_allocator const &a) {return !operator==(a); } +}; // end device_malloc_allocator + +/*! \} + */ + +} // end thrust + + diff --git a/compat/thrust/device_new.h b/compat/thrust/device_new.h new file mode 100644 index 0000000..001d476 --- /dev/null +++ b/compat/thrust/device_new.h @@ -0,0 +1,88 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file device_new.h + * \brief Constructs new elements in device memory + */ + +#pragma once + +#include + +// #include this for size_t +#include +#include + +namespace thrust +{ + +/*! + * \addtogroup allocation_functions Allocation Functions + * \{ + */ + +/*! \p device_new implements the placement \c new operator for types + * resident in device memory. \p device_new calls T's null + * constructor on a array of objects in device memory. + * No memory is allocated by this function. + * + * \param p A \p device_ptr to a region of device memory into which + * to construct one or many Ts. + * \param n The number of objects to construct at \p p. + * \return p, casted to T's type. + * + * \see device_ptr + */ +template + device_ptr device_new(device_ptr p, + const size_t n = 1); + +/*! \p device_new implements the placement new operator for types + * resident in device memory. \p device_new calls T's copy + * constructor on a array of objects in device memory. No memory is + * allocated by this function. + * + * \param p A \p device_ptr to a region of device memory into which to + * construct one or many Ts. + * \param exemplar The value from which to copy. + * \param n The number of objects to construct at \p p. + * \return p, casted to T's type. + * + * \see device_ptr + * \see fill + */ +template + device_ptr device_new(device_ptr p, + const T &exemplar, + const size_t n = 1); + +/*! \p device_new implements the new operator for types resident in device memory. + * It allocates device memory large enough to hold \p n new objects of type \c T. + * + * \param n The number of objects to allocate. Defaults to \c 1. + * \return A \p device_ptr to the newly allocated region of device memory. + */ +template + device_ptr device_new(const size_t n = 1); + +/*! 
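To make the device_new overloads above concrete, here is a small sketch (not from the patch; template spellings and include paths are assumed) pairing the allocating overload with device_delete:

#include <thrust/device_new.h>
#include <thrust/device_delete.h>

int main(void)
{
    // allocate device memory large enough to hold 5 ints, per the last overload above
    thrust::device_ptr<int> p = thrust::device_new<int>(5);

    // the objects are reachable from the host through the device_ptr
    p[0] = 42;

    // destroy the 5 objects at p and release the storage obtained from device_new
    thrust::device_delete(p, 5);

    return 0;
}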
\} + */ + +} // end thrust + +#include + diff --git a/compat/thrust/device_new_allocator.h b/compat/thrust/device_new_allocator.h new file mode 100644 index 0000000..527d1fd --- /dev/null +++ b/compat/thrust/device_new_allocator.h @@ -0,0 +1,172 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file device_new_allocator.h + * \brief An allocator which allocates storage with \p device_new + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ + +/*! \addtogroup memory_management Memory Management + * \addtogroup memory_management_classes Memory Management Classes + * \ingroup memory_management + * \{ + */ + +/*! \p device_new_allocator is a device memory allocator that employs the + * \p device_new function for allocation. + * + * \see device_new + * \see device_ptr + * \see http://www.sgi.com/tech/stl/Allocators.html + */ +template + class device_new_allocator +{ + public: + /*! Type of element allocated, \c T. */ + typedef T value_type; + + /*! Pointer to allocation, \c device_ptr. */ + typedef device_ptr pointer; + + /*! \c const pointer to allocation, \c device_ptr. */ + typedef device_ptr const_pointer; + + /*! Reference to allocated element, \c device_reference. */ + typedef device_reference reference; + + /*! \c const reference to allocated element, \c device_reference. */ + typedef device_reference const_reference; + + /*! Type of allocation size, \c std::size_t. */ + typedef std::size_t size_type; + + /*! Type of allocation difference, \c pointer::difference_type. */ + typedef typename pointer::difference_type difference_type; + + /*! The \p rebind metafunction provides the type of a \p device_new_allocator + * instantiated with another type. + * + * \tparam U The other type to use for instantiation. + */ + template + struct rebind + { + /*! The typedef \p other gives the type of the rebound \p device_new_allocator. + */ + typedef device_new_allocator other; + }; // end rebind + + /*! No-argument constructor has no effect. */ + __host__ __device__ + inline device_new_allocator() {} + + /*! No-argument destructor has no effect. */ + __host__ __device__ + inline ~device_new_allocator() {} + + /*! Copy constructor has no effect. */ + __host__ __device__ + inline device_new_allocator(device_new_allocator const&) {} + + /*! Constructor from other \p device_malloc_allocator has no effect. */ + template + __host__ __device__ + inline device_new_allocator(device_new_allocator const&) {} + + /*! Returns the address of an allocated object. + * \return &r. + */ + __host__ __device__ + inline pointer address(reference r) { return &r; } + + /*! Returns the address an allocated object. + * \return &r. + */ + __host__ __device__ + inline const_pointer address(const_reference r) { return &r; } + + /*! Allocates storage for \p cnt objects. + * \param cnt The number of objects to allocate. + * \return A \p pointer to uninitialized storage for \p cnt objects. 
+ * \note Memory allocated by this function must be deallocated with \p deallocate. + */ + __host__ + inline pointer allocate(size_type cnt, + const_pointer = const_pointer(static_cast(0))) + { + if(cnt > this->max_size()) + { + throw std::bad_alloc(); + } // end if + + // use "::operator new" rather than keyword new + return pointer(device_new(cnt)); + } // end allocate() + + /*! Deallocates storage for objects allocated with \p allocate. + * \param p A \p pointer to the storage to deallocate. + * \param cnt The size of the previous allocation. + * \note Memory deallocated by this function must previously have been + * allocated with \p allocate. + */ + __host__ + inline void deallocate(pointer p, size_type cnt) + { + // use "::operator delete" rather than keyword delete + device_delete(p); + } // end deallocate() + + /*! Returns the largest value \c n for which allocate(n) might succeed. + * \return The largest value \c n for which allocate(n) might succeed. + */ + __host__ __device__ + inline size_type max_size() const + { + return std::numeric_limits::max THRUST_PREVENT_MACRO_SUBSTITUTION () / sizeof(T); + } // end max_size() + + /*! Compares against another \p device_malloc_allocator for equality. + * \return \c true + */ + __host__ __device__ + inline bool operator==(device_new_allocator const&) { return true; } + + /*! Compares against another \p device_malloc_allocator for inequality. + * \return \c false + */ + __host__ __device__ + inline bool operator!=(device_new_allocator const &a) {return !operator==(a); } +}; // end device_new_allocator + +/*! \} + */ + +} // end thrust + diff --git a/compat/thrust/device_ptr.h b/compat/thrust/device_ptr.h new file mode 100644 index 0000000..dfc7e90 --- /dev/null +++ b/compat/thrust/device_ptr.h @@ -0,0 +1,170 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file device_ptr.h + * \brief A pointer to a variable which resides in the "device" system's memory space + */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ + +/*! \addtogroup memory_management Memory Management + * \addtogroup memory_management_classes Memory Management Classes + * \ingroup memory_management + * \{ + */ + +// forward declarations +template class device_reference; + +/*! \p device_ptr stores a pointer to an object allocated in device memory. This type + * provides type safety when dispatching standard algorithms on ranges resident in + * device memory. + * + * \p device_ptr has pointer semantics: it may be dereferenced safely from the host and + * may be manipulated with pointer arithmetic. + * + * \p device_ptr can be created with the functions device_malloc, device_new, or + * device_pointer_cast, or by explicitly calling its constructor with a raw pointer. + * + * The raw pointer encapsulated by a \p device_ptr may be obtained by either its get + * method or the \p raw_pointer_cast free function. 
+ * + * \note \p device_ptr is not a smart pointer; it is the programmer's responsibility to + * deallocate memory pointed to by \p device_ptr. + * + * \see device_malloc + * \see device_new + * \see device_pointer_cast + * \see raw_pointer_cast + */ +template + class device_ptr + : public thrust::pointer< + T, + thrust::device_system_tag, + thrust::device_reference, + thrust::device_ptr + > +{ + private: + typedef thrust::pointer< + T, + thrust::device_system_tag, + thrust::device_reference, + thrust::device_ptr + > super_t; + + public: + /*! \p device_ptr's null constructor initializes its raw pointer to \c 0. + */ + __host__ __device__ + device_ptr() : super_t() {} + + /*! \p device_ptr's copy constructor is templated to allow copying to a + * device_ptr from a T *. + * + * \param ptr A raw pointer to copy from, presumed to point to a location in + * device memory. + */ + template + __host__ __device__ + explicit device_ptr(OtherT *ptr) : super_t(ptr) {} + + /*! \p device_ptr's copy constructor allows copying from another device_ptr with related type. + * \param other The \p device_ptr to copy from. + */ + template + __host__ __device__ + device_ptr(const device_ptr &other) : super_t(other) {} + + /*! \p device_ptr's assignment operator allows assigning from another \p device_ptr with related type. + * \param other The other \p device_ptr to copy from. + * \return *this + */ + template + __host__ __device__ + device_ptr &operator=(const device_ptr &other) + { + super_t::operator=(other); + return *this; + } + +// declare these members for the purpose of Doxygenating them +// they actually exist in a derived-from class +#if 0 + /*! This method returns this \p device_ptr's raw pointer. + * \return This \p device_ptr's raw pointer. + */ + __host__ __device__ + T *get(void) const; +#endif // end doxygen-only members +}; // end device_ptr + +/*! This operator outputs the value of a \p device_ptr's raw pointer to a \p std::basic_ostream. + * + * \param os The std::basic_ostream of interest. + * \param p The device_ptr of interest. + * \return os. + */ +template +inline std::basic_ostream &operator<<(std::basic_ostream &os, const device_ptr &p); + +/*! \} + */ + + +/*! + * \addtogroup memory_management_functions Memory Management Functions + * \ingroup memory_management + * \{ + */ + +/*! \p device_pointer_cast creates a device_ptr from a raw pointer which is presumed to point + * to a location in device memory. + * + * \param ptr A raw pointer, presumed to point to a location in device memory. + * \return A device_ptr wrapping ptr. + */ +template +__host__ __device__ +inline device_ptr device_pointer_cast(T *ptr); + +/*! This version of \p device_pointer_cast creates a copy of a device_ptr from another device_ptr. + * This version is included for symmetry with \p raw_pointer_cast. + * + * \param ptr A device_ptr. + * \return A copy of \p ptr. + */ +template +__host__ __device__ +inline device_ptr device_pointer_cast(const device_ptr &ptr); + +/*! \} + */ + +} // end thrust + +#include +#include + diff --git a/compat/thrust/device_reference.h b/compat/thrust/device_reference.h new file mode 100644 index 0000000..edae2b5 --- /dev/null +++ b/compat/thrust/device_reference.h @@ -0,0 +1,969 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file device_reference.h + * \brief A reference to a variable which resides in the "device" system's memory space + */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ + +/*! \addtogroup memory_management_classes Memory Management Classes + * \ingroup memory_management + * \{ + */ + +/*! \p device_reference acts as a reference-like object to an object stored in device memory. + * \p device_reference is not intended to be used directly; rather, this type + * is the result of deferencing a \p device_ptr. Similarly, taking the address of + * a \p device_reference yields a \p device_ptr. + * + * \p device_reference may often be used from host code in place of operations defined on + * its associated \c value_type. For example, when \p device_reference refers to an + * arithmetic type, arithmetic operations on it are legal: + * + * \code + * #include + * + * int main(void) + * { + * thrust::device_vector vec(1, 13); + * + * thrust::device_reference ref_to_thirteen = vec[0]; + * + * int x = ref_to_thirteen + 1; + * + * // x is 14 + * + * return 0; + * } + * \endcode + * + * Similarly, we can print the value of \c ref_to_thirteen in the above code by using an + * \c iostream: + * + * \code + * #include + * #include + * + * int main(void) + * { + * thrust::device_vector vec(1, 13); + * + * thrust::device_reference ref_to_thirteen = vec[0]; + * + * std::cout << ref_to_thirteen << std::endl; + * + * // 13 is printed + * + * return 0; + * } + * \endcode + * + * Of course, we needn't explicitly create a \p device_reference in the previous + * example, because one is returned by \p device_vector's bracket operator. A more natural + * way to print the value of a \p device_vector element might be: + * + * \code + * #include + * #include + * + * int main(void) + * { + * thrust::device_vector vec(1, 13); + * + * std::cout << vec[0] << std::endl; + * + * // 13 is printed + * + * return 0; + * } + * \endcode + * + * These kinds of operations should be used sparingly in performance-critical code, because + * they imply a potentially expensive copy between host and device space. + * + * Some operations which are possible with regular objects are impossible with their + * corresponding \p device_reference objects due to the requirements of the C++ language. For + * example, because the member access operator cannot be overloaded, member variables and functions + * of a referent object cannot be directly accessed through its \p device_reference. 
+ * + * The following code, which generates a compiler error, illustrates: + * + * \code + * #include + * + * struct foo + * { + * int x; + * }; + * + * int main(void) + * { + * thrust::device_vector foo_vec(1); + * + * thrust::device_reference foo_ref = foo_vec[0]; + * + * foo_ref.x = 13; // ERROR: x cannot be accessed through foo_ref + * + * return 0; + * } + * \endcode + * + * Instead, a host space copy must be created to access \c foo's \c x member: + * + * \code + * #include + * + * struct foo + * { + * int x; + * }; + * + * int main(void) + * { + * thrust::device_vector foo_vec(1); + * + * // create a local host-side foo object + * foo host_foo; + * host_foo.x = 13; + * + * thrust::device_reference foo_ref = foo_vec[0]; + * + * foo_ref = host_foo; + * + * // foo_ref's x member is 13 + * + * return 0; + * } + * \endcode + * + * Another common case where a \p device_reference cannot directly be used in place of + * its referent object occurs when passing them as parameters to functions like \c printf + * which have varargs parameters. Because varargs parameters must be Plain Old Data, a + * \p device_reference to a POD type requires a cast when passed to \c printf: + * + * \code + * #include + * #include + * + * int main(void) + * { + * thrust::device_vector vec(1,13); + * + * // vec[0] must be cast to int when passing to printf + * printf("%d\n", (int) vec[0]); + * + * return 0; + * } + * \endcode + * + * \see device_ptr + * \see device_vector + */ +template + class device_reference + : public thrust::reference< + T, + thrust::device_ptr, + thrust::device_reference + > +{ + private: + typedef thrust::reference< + T, + thrust::device_ptr, + thrust::device_reference + > super_t; + + public: + /*! The type of the value referenced by this type of \p device_reference. + */ + typedef typename super_t::value_type value_type; + + /*! The type of the expression &ref, where ref is a \p device_reference. + */ + typedef typename super_t::pointer pointer; + + /*! This copy constructor accepts a const reference to another + * \p device_reference. After this \p device_reference is constructed, + * it shall refer to the same object as \p other. + * + * \param other A \p device_reference to copy from. + * + * The following code snippet demonstrates the semantics of this + * copy constructor. + * + * \code + * #include + * #include + * ... + * thrust::device_vector v(1,0); + * thrust::device_reference ref = v[0]; + * + * // ref equals the object at v[0] + * assert(ref == v[0]); + * + * // the address of ref equals the address of v[0] + * assert(&ref == &v[0]); + * + * // modifying v[0] modifies ref + * v[0] = 13; + * assert(ref == 13); + * \endcode + * + * \note This constructor is templated primarily to allow initialization of + * device_reference from device_reference. + */ + template + __host__ __device__ + device_reference(const device_reference &other, + typename thrust::detail::enable_if_convertible< + typename device_reference::pointer, + pointer + >::type * = 0) + : super_t(other) + {} + + /*! This copy constructor initializes this \p device_reference + * to refer to an object pointed to by the given \p device_ptr. After + * this \p device_reference is constructed, it shall refer to the + * object pointed to by \p ptr. + * + * \param ptr A \p device_ptr to copy from. + * + * The following code snippet demonstrates the semantic of this + * copy constructor. + * + * \code + * #include + * #include + * ... 
+ * thrust::device_vector v(1,0); + * thrust::device_ptr ptr = &v[0]; + * thrust::device_reference ref(ptr); + * + * // ref equals the object pointed to by ptr + * assert(ref == *ptr); + * + * // the address of ref equals ptr + * assert(&ref == ptr); + * + * // modifying *ptr modifies ref + * *ptr = 13; + * assert(ref == 13); + * \endcode + */ + __host__ __device__ + explicit device_reference(const pointer &ptr) + : super_t(ptr) + {} + + /*! This assignment operator assigns the value of the object referenced by + * the given \p device_reference to the object referenced by this + * \p device_reference. + * + * \param other The \p device_reference to assign from. + * \return *this + */ + template + __host__ __device__ + device_reference &operator=(const device_reference &other); + + /*! Assignment operator assigns the value of the given value to the + * value referenced by this \p device_reference. + * + * \param x The value to assign from. + * \return *this + */ + __host__ __device__ + device_reference &operator=(const value_type &x); + +// declare these members for the purpose of Doxygenating them +// they actually exist in a derived-from class +#if 0 + /*! Address-of operator returns a \p device_ptr pointing to the object + * referenced by this \p device_reference. It does not return the + * address of this \p device_reference. + * + * \return A \p device_ptr pointing to the object this + * \p device_reference references. + */ + __host__ __device__ + pointer operator&(void) const; + + /*! Conversion operator converts this \p device_reference to T + * by returning a copy of the object referenced by this + * \p device_reference. + * + * \return A copy of the object referenced by this \p device_reference. + */ + __host__ __device__ + operator value_type (void) const; + + /*! swaps the value this \p device_reference references with another. + * \p other The other \p device_reference with which to swap. + */ + __host__ __device__ + void swap(device_reference &other); + + /*! Prefix increment operator increments the object referenced by this + * \p device_reference. + * + * \return *this + * + * The following code snippet demonstrates the semantics of + * \p device_reference's prefix increment operator. + * + * \code + * #include + * #include + * ... + * thrust::device_vector v(1,0); + * thrust::device_ptr ptr = &v[0]; + * thrust::device_reference ref(ptr); + * + * // ref equals 0 + * assert(ref == 0); + * + * // the object pointed to by ptr equals 1 + * assert(*ptr == 1); + * + * // v[0] equals 1 + * assert(v[0] == 1); + * + * // increment ref + * ++ref; + * + * // ref equals 1 + * assert(ref == 1); + * + * // the object pointed to by ptr equals 1 + * assert(*ptr == 1); + * + * // v[0] equals 1 + * assert(v[0] == 1); + * \endcode + * + * \note The increment executes as if it were executed on the host. + * This may change in a later version. + */ + device_reference &operator++(void); + + /*! Postfix increment operator copies the object referenced by this + * \p device_reference, increments the object referenced by this + * \p device_reference, and returns the copy. + * + * \return A copy of the object referenced by this \p device_reference + * before being incremented. + * + * The following code snippet demonstrates the semantics of + * \p device_reference's postfix increment operator. + * + * \code + * #include + * #include + * ... 
+ * thrust::device_vector v(1,0); + * thrust::device_ptr ptr = &v[0]; + * thrust::device_reference ref(ptr); + * + * // ref equals 0 + * assert(ref == 0); + * + * // the object pointed to by ptr equals 0 + * assert(*ptr == 0); + * + * // v[0] equals 0 + * assert(v[0] == 0); + * + * // increment ref + * int x = ref++; + * + * // x equals 0 + * assert(x == 0) + * + * // ref equals 1 + * assert(ref == 1); + * + * // the object pointed to by ptr equals 1 + * assert(*ptr == 1); + * + * // v[0] equals 1 + * assert(v[0] == 1); + * \endcode + * + * \note The increment executes as if it were executed on the host. + * This may change in a later version. + */ + value_type operator++(int); + + /*! Addition assignment operator add-assigns the object referenced by this + * \p device_reference and returns this \p device_reference. + * + * \param rhs The right hand side of the add-assignment. + * \return *this. + * + * The following code snippet demonstrates the semantics of + * \p device_reference's addition assignment operator. + * + * \code + * #include + * #include + * ... + * thrust::device_vector v(1,0); + * thrust::device_ptr ptr = &v[0]; + * thrust::device_reference ref(ptr); + * + * // ref equals 0 + * assert(ref == 0); + * + * // the object pointed to by ptr equals 0 + * assert(*ptr == 0); + * + * // v[0] equals 0 + * assert(v[0] == 0); + * + * // add-assign ref + * ref += 5; + * + * // ref equals 5 + * assert(ref == 5); + * + * // the object pointed to by ptr equals 5 + * assert(*ptr == 5); + * + * // v[0] equals 5 + * assert(v[0] == 5); + * \endcode + * + * \note The add-assignment executes as as if it were executed on the host. + * This may change in a later version. + */ + device_reference &operator+=(const T &rhs); + + /*! Prefix decrement operator decrements the object referenced by this + * \p device_reference. + * + * \return *this + * + * The following code snippet demonstrates the semantics of + * \p device_reference's prefix decrement operator. + * + * \code + * #include + * #include + * ... + * thrust::device_vector v(1,0); + * thrust::device_ptr ptr = &v[0]; + * thrust::device_reference ref(ptr); + * + * // ref equals 0 + * assert(ref == 0); + * + * // the object pointed to by ptr equals 0 + * assert(*ptr == 0); + * + * // v[0] equals 0 + * assert(v[0] == 0); + * + * // decrement ref + * --ref; + * + * // ref equals -1 + * assert(ref == -1); + * + * // the object pointed to by ptr equals -1 + * assert(*ptr == -1); + * + * // v[0] equals -1 + * assert(v[0] == -1); + * \endcode + * + * \note The decrement executes as if it were executed on the host. + * This may change in a later version. + */ + device_reference &operator--(void); + + /*! Postfix decrement operator copies the object referenced by this + * \p device_reference, decrements the object referenced by this + * \p device_reference, and returns the copy. + * + * \return A copy of the object referenced by this \p device_reference + * before being decremented. + * + * The following code snippet demonstrates the semantics of + * \p device_reference's postfix decrement operator. + * + * \code + * #include + * #include + * ... 
+ * thrust::device_vector v(1,0); + * thrust::device_ptr ptr = &v[0]; + * thrust::device_reference ref(ptr); + * + * // ref equals 0 + * assert(ref == 0); + * + * // the object pointed to by ptr equals 0 + * assert(*ptr == 0); + * + * // v[0] equals 0 + * assert(v[0] == 0); + * + * // decrement ref + * int x = ref--; + * + * // x equals 0 + * assert(x == 0) + * + * // ref equals -1 + * assert(ref == -1); + * + * // the object pointed to by ptr equals -1 + * assert(*ptr == -1); + * + * // v[0] equals -1 + * assert(v[0] == -1); + * \endcode + * + * \note The decrement executes as if it were executed on the host. + * This may change in a later version. + */ + value_type operator--(int); + + /*! Subtraction assignment operator subtract-assigns the object referenced by this + * \p device_reference and returns this \p device_reference. + * + * \param rhs The right hand side of the subtraction-assignment. + * \return *this. + * + * The following code snippet demonstrates the semantics of + * \p device_reference's addition assignment operator. + * + * \code + * #include + * #include + * ... + * thrust::device_vector v(1,0); + * thrust::device_ptr ptr = &v[0]; + * thrust::device_reference ref(ptr); + * + * // ref equals 0 + * assert(ref == 0); + * + * // the object pointed to by ptr equals 0 + * assert(*ptr == 0); + * + * // v[0] equals 0 + * assert(v[0] == 0); + * + * // subtract-assign ref + * ref -= 5; + * + * // ref equals -5 + * assert(ref == -5); + * + * // the object pointed to by ptr equals -5 + * assert(*ptr == -5); + * + * // v[0] equals -5 + * assert(v[0] == -5); + * \endcode + * + * \note The subtract-assignment executes as as if it were executed on the host. + * This may change in a later version. + */ + device_reference &operator-=(const T &rhs); + + /*! Multiplication assignment operator multiply-assigns the object referenced by this + * \p device_reference and returns this \p device_reference. + * + * \param rhs The right hand side of the multiply-assignment. + * \return *this. + * + * The following code snippet demonstrates the semantics of + * \p device_reference's multiply assignment operator. + * + * \code + * #include + * #include + * ... + * thrust::device_vector v(1,1); + * thrust::device_ptr ptr = &v[0]; + * thrust::device_reference ref(ptr); + * + * // ref equals 1 + * assert(ref == 1); + * + * // the object pointed to by ptr equals 1 + * assert(*ptr == 1); + * + * // v[0] equals 1 + * assert(v[0] == 1); + * + * // multiply-assign ref + * ref *= 5; + * + * // ref equals 5 + * assert(ref == 5); + * + * // the object pointed to by ptr equals 5 + * assert(*ptr == 5); + * + * // v[0] equals 5 + * assert(v[0] == 5); + * \endcode + * + * \note The multiply-assignment executes as as if it were executed on the host. + * This may change in a later version. + */ + device_reference &operator*=(const T &rhs); + + /*! Division assignment operator divide-assigns the object referenced by this + * \p device_reference and returns this \p device_reference. + * + * \param rhs The right hand side of the divide-assignment. + * \return *this. + * + * The following code snippet demonstrates the semantics of + * \p device_reference's divide assignment operator. + * + * \code + * #include + * #include + * ... 
+ * thrust::device_vector v(1,5); + * thrust::device_ptr ptr = &v[0]; + * thrust::device_reference ref(ptr); + * + * // ref equals 5 + * assert(ref == 5); + * + * // the object pointed to by ptr equals 5 + * assert(*ptr == 5); + * + * // v[0] equals 5 + * assert(v[0] == 5); + * + * // divide-assign ref + * ref /= 5; + * + * // ref equals 1 + * assert(ref == 1); + * + * // the object pointed to by ptr equals 1 + * assert(*ptr == 1); + * + * // v[0] equals 1 + * assert(v[0] == 1); + * \endcode + * + * \note The divide-assignment executes as as if it were executed on the host. + * This may change in a later version. + */ + device_reference &operator/=(const T &rhs); + + /*! Modulation assignment operator modulus-assigns the object referenced by this + * \p device_reference and returns this \p device_reference. + * + * \param rhs The right hand side of the divide-assignment. + * \return *this. + * + * The following code snippet demonstrates the semantics of + * \p device_reference's divide assignment operator. + * + * \code + * #include + * #include + * ... + * thrust::device_vector v(1,5); + * thrust::device_ptr ptr = &v[0]; + * thrust::device_reference ref(ptr); + * + * // ref equals 5 + * assert(ref == 5); + * + * // the object pointed to by ptr equals 5 + * assert(*ptr == 5); + * + * // v[0] equals 5 + * assert(v[0] == 5); + * + * // modulus-assign ref + * ref %= 5; + * + * // ref equals 0 + * assert(ref == 0); + * + * // the object pointed to by ptr equals 0 + * assert(*ptr == 0); + * + * // v[0] equals 0 + * assert(v[0] == 0); + * \endcode + * + * \note The modulus-assignment executes as as if it were executed on the host. + * This may change in a later version. + */ + device_reference &operator%=(const T &rhs); + + /*! Bitwise left shift assignment operator left shift-assigns the object referenced by this + * \p device_reference and returns this \p device_reference. + * + * \param rhs The right hand side of the left shift-assignment. + * \return *this. + * + * The following code snippet demonstrates the semantics of + * \p device_reference's left shift assignment operator. + * + * \code + * #include + * #include + * ... + * thrust::device_vector v(1,1); + * thrust::device_ptr ptr = &v[0]; + * thrust::device_reference ref(ptr); + * + * // ref equals 1 + * assert(ref == 1); + * + * // the object pointed to by ptr equals 1 + * assert(*ptr == 1); + * + * // v[0] equals 1 + * assert(v[0] == 1); + * + * // left shift-assign ref + * ref <<= 1; + * + * // ref equals 2 + * assert(ref == 2); + * + * // the object pointed to by ptr equals 2 + * assert(*ptr == 2); + * + * // v[0] equals 2 + * assert(v[0] == 2); + * \endcode + * + * \note The left shift-assignment executes as as if it were executed on the host. + * This may change in a later version. + */ + device_reference &operator<<=(const T &rhs); + + /*! Bitwise right shift assignment operator right shift-assigns the object referenced by this + * \p device_reference and returns this \p device_reference. + * + * \param rhs The right hand side of the right shift-assignment. + * \return *this. + * + * The following code snippet demonstrates the semantics of + * \p device_reference's right shift assignment operator. + * + * \code + * #include + * #include + * ... 
+ * thrust::device_vector v(1,2); + * thrust::device_ptr ptr = &v[0]; + * thrust::device_reference ref(ptr); + * + * // ref equals 2 + * assert(ref == 2); + * + * // the object pointed to by ptr equals 2 + * assert(*ptr == 2); + * + * // v[0] equals 2 + * assert(v[0] == 2); + * + * // right shift-assign ref + * ref >>= 1; + * + * // ref equals 1 + * assert(ref == 1); + * + * // the object pointed to by ptr equals 1 + * assert(*ptr == 1); + * + * // v[0] equals 1 + * assert(v[0] == 1); + * \endcode + * + * \note The right shift-assignment executes as as if it were executed on the host. + * This may change in a later version. + */ + device_reference &operator>>=(const T &rhs); + + /*! Bitwise AND assignment operator AND-assigns the object referenced by this + * \p device_reference and returns this \p device_reference. + * + * \param rhs The right hand side of the AND-assignment. + * \return *this. + * + * The following code snippet demonstrates the semantics of + * \p device_reference's AND assignment operator. + * + * \code + * #include + * #include + * ... + * thrust::device_vector v(1,1); + * thrust::device_ptr ptr = &v[0]; + * thrust::device_reference ref(ptr); + * + * // ref equals 1 + * assert(ref == 1); + * + * // the object pointed to by ptr equals 1 + * assert(*ptr == 1); + * + * // v[0] equals 1 + * assert(v[0] == 1); + * + * // right AND-assign ref + * ref &= 0; + * + * // ref equals 0 + * assert(ref == 0); + * + * // the object pointed to by ptr equals 0 + * assert(*ptr == 0); + * + * // v[0] equals 0 + * assert(v[0] == 0); + * \endcode + * + * \note The AND-assignment executes as as if it were executed on the host. + * This may change in a later version. + */ + device_reference &operator&=(const T &rhs); + + /*! Bitwise OR assignment operator OR-assigns the object referenced by this + * \p device_reference and returns this \p device_reference. + * + * \param rhs The right hand side of the OR-assignment. + * \return *this. + * + * The following code snippet demonstrates the semantics of + * \p device_reference's OR assignment operator. + * + * \code + * #include + * #include + * ... + * thrust::device_vector v(1,0); + * thrust::device_ptr ptr = &v[0]; + * thrust::device_reference ref(ptr); + * + * // ref equals 0 + * assert(ref == 0); + * + * // the object pointed to by ptr equals 0 + * assert(*ptr == 0); + * + * // v[0] equals 0 + * assert(v[0] == 0); + * + * // right OR-assign ref + * ref |= 1; + * + * // ref equals 1 + * assert(ref == 1); + * + * // the object pointed to by ptr equals 1 + * assert(*ptr == 1); + * + * // v[0] equals 1 + * assert(v[0] == 1); + * \endcode + * + * \note The OR-assignment executes as as if it were executed on the host. + * This may change in a later version. + */ + device_reference &operator|=(const T &rhs); + + /*! Bitwise XOR assignment operator XOR-assigns the object referenced by this + * \p device_reference and returns this \p device_reference. + * + * \param rhs The right hand side of the XOR-assignment. + * \return *this. + * + * The following code snippet demonstrates the semantics of + * \p device_reference's XOR assignment operator. + * + * \code + * #include + * #include + * ... 
+ * thrust::device_vector v(1,1); + * thrust::device_ptr ptr = &v[0]; + * thrust::device_reference ref(ptr); + * + * // ref equals 1 + * assert(ref == 1); + * + * // the object pointed to by ptr equals 1 + * assert(*ptr == 1); + * + * // v[0] equals 1 + * assert(v[0] == 1); + * + * // right XOR-assign ref + * ref ^= 1; + * + * // ref equals 0 + * assert(ref == 0); + * + * // the object pointed to by ptr equals 0 + * assert(*ptr == 0); + * + * // v[0] equals 0 + * assert(v[0] == 0); + * \endcode + * + * \note The XOR-assignment executes as as if it were executed on the host. + * This may change in a later version. + */ + device_reference &operator^=(const T &rhs); +#endif // end doxygen-only members +}; // end device_reference + +/*! swaps the value of one \p device_reference with another. + * \p x The first \p device_reference of interest. + * \p y The second \p device_reference of interest. + */ +template +__host__ __device__ +void swap(device_reference &x, device_reference &y); + +/*! \} + */ + +} // end thrust + +#include + diff --git a/compat/thrust/device_vector.h b/compat/thrust/device_vector.h new file mode 100644 index 0000000..8c9d005 --- /dev/null +++ b/compat/thrust/device_vector.h @@ -0,0 +1,418 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file device_vector.h + * \brief A dynamically-sizable array of elements which reside in the "device" memory space + */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ + +// forward declaration of host_vector +template class host_vector; + +/*! \addtogroup container_classes Container Classes + * \addtogroup device_containers Device Containers + * \ingroup container_classes + * \{ + */ + +/*! A \p device_vector is a container that supports random access to elements, + * constant time removal of elements at the end, and linear time insertion + * and removal of elements at the beginning or in the middle. The number of + * elements in a \p device_vector may vary dynamically; memory management is + * automatic. The memory associated with a \p device_vector resides in the memory + * space of a parallel device. + * + * \see http://www.sgi.com/tech/stl/Vector.html + * \see host_vector + */ +template > + class device_vector + : public detail::vector_base +{ + private: + typedef detail::vector_base Parent; + + public: + /*! \cond */ + typedef typename Parent::size_type size_type; + typedef typename Parent::value_type value_type; + /*! \endcond */ + + /*! This constructor creates an empty \p device_vector. + */ + __host__ + device_vector(void) + :Parent() {} + + /*! This constructor creates a \p device_vector with the given + * size. + * \param n The number of elements to initially craete. + */ + __host__ + explicit device_vector(size_type n) + :Parent(n) {} + + /*! This constructor creates a \p device_vector with copies + * of an exemplar element. + * \param n The number of elements to initially create. + * \param value An element to copy. 
+ */ + __host__ + explicit device_vector(size_type n, const value_type &value) + :Parent(n,value) {} + + /*! Copy constructor copies from an exemplar \p device_vector. + * \param v The \p device_vector to copy. + */ + __host__ + device_vector(const device_vector &v) + :Parent(v) {} + + /*! Copy constructor copies from an exemplar \p device_vector with different type. + * \param v The \p device_vector to copy. + */ + template + __device__ + device_vector(const device_vector &v) + :Parent(v) {} + + /*! Assign operator copies from an exemplar \p device_vector with different type. + * \param v The \p device_vector to copy. + */ + template + __device__ + device_vector &operator=(const device_vector &v) + { Parent::operator=(v); return *this; } + + /*! Copy constructor copies from an exemplar \c std::vector. + * \param v The std::vector to copy. + */ + template + __host__ + device_vector(const std::vector &v) + :Parent(v) {} + + /*! Assign operator copies from an exemplar std::vector. + * \param v The std::vector to copy. + */ + template + __host__ + device_vector &operator=(const std::vector &v) + { Parent::operator=(v); return *this;} + + /*! Copy constructor copies from an exemplar \p host_vector with possibly different type. + * \param v The \p host_vector to copy. + */ + template + __host__ + device_vector(const host_vector &v); + + /*! Assign operator copies from an examplar \p host_vector. + * \param v The \p host_vector to copy. + */ + template + __host__ + device_vector &operator=(const host_vector &v) + { Parent::operator=(v); return *this; } + + /*! This constructor builds a \p device_vector from a range. + * \param first The beginning of the range. + * \param last The end of the range. + */ + template + __host__ + device_vector(InputIterator first, InputIterator last) + :Parent(first,last) {} + +// declare these members for the purpose of Doxygenating them +// they actually exist in a derived-from class +#if 0 + /*! \brief Resizes this vector to the specified number of elements. + * \param new_size Number of elements this vector should contain. + * \param x Data with which new elements should be populated. + * \throw std::length_error If n exceeds max_size(). + * + * This method will resize this vector to the specified number of + * elements. If the number is smaller than this vector's current + * size this vector is truncated, otherwise this vector is + * extended and new elements are populated with given data. + */ + void resize(size_type new_size, const value_type &x = value_type()); + + /*! Returns the number of elements in this vector. + */ + size_type size(void) const; + + /*! Returns the size() of the largest possible vector. + * \return The largest possible return value of size(). + */ + size_type max_size(void) const; + + /*! \brief If n is less than or equal to capacity(), this call has no effect. + * Otherwise, this method is a request for allocation of additional memory. If + * the request is successful, then capacity() is greater than or equal to + * n; otherwise, capacity() is unchanged. In either case, size() is unchanged. + * \throw std::length_error If n exceeds max_size(). + */ + void reserve(size_type n); + + /*! Returns the number of elements which have been reserved in this + * vector. + */ + size_type capacity(void) const; + + /*! This method shrinks the capacity of this vector to exactly + * fit its elements. + */ + void shrink_to_fit(void); + + /*! \brief Subscript access to the data contained in this vector_dev. 
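+ *
+ *  As a rough illustration of the constructors and assignment operators documented
+ *  above (the element types and values here are assumptions, not part of the header):
+ *
+ *  \code
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/host_vector.h>
+ *  #include <vector>
+ *  ...
+ *  std::vector<int> stl_vec(4, 7);              // four 7s in host memory
+ *  thrust::device_vector<int> d_vec(stl_vec);   // copies the host data to the device
+ *
+ *  thrust::host_vector<float> h_vec(4, 1.0f);
+ *  thrust::device_vector<float> d_flt(h_vec.begin(), h_vec.end());  // range constructor
+ *
+ *  d_vec = std::vector<int>(2, 3);              // cross-space assignment; d_vec is now {3, 3}
+ *  \endcode
+ *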
+ * \param n The index of the element for which data should be accessed. + * \return Read/write reference to data. + * + * This operator allows for easy, array-style, data access. + * Note that data access with this operator is unchecked and + * out_of_range lookups are not defined. + */ + reference operator[](size_type n); + + /*! \brief Subscript read access to the data contained in this vector_dev. + * \param n The index of the element for which data should be accessed. + * \return Read reference to data. + * + * This operator allows for easy, array-style, data access. + * Note that data access with this operator is unchecked and + * out_of_range lookups are not defined. + */ + const_reference operator[](size_type n) const; + + /*! This method returns an iterator pointing to the beginning of + * this vector. + * \return mStart + */ + iterator begin(void); + + /*! This method returns a const_iterator pointing to the beginning + * of this vector. + * \return mStart + */ + const_iterator begin(void) const; + + /*! This method returns a const_iterator pointing to the beginning + * of this vector. + * \return mStart + */ + const_iterator cbegin(void) const; + + /*! This method returns a reverse_iterator pointing to the beginning of + * this vector's reversed sequence. + * \return A reverse_iterator pointing to the beginning of this + * vector's reversed sequence. + */ + reverse_iterator rbegin(void); + + /*! This method returns a const_reverse_iterator pointing to the beginning of + * this vector's reversed sequence. + * \return A const_reverse_iterator pointing to the beginning of this + * vector's reversed sequence. + */ + const_reverse_iterator rbegin(void) const; + + /*! This method returns a const_reverse_iterator pointing to the beginning of + * this vector's reversed sequence. + * \return A const_reverse_iterator pointing to the beginning of this + * vector's reversed sequence. + */ + const_reverse_iterator crbegin(void) const; + + /*! This method returns an iterator pointing to one element past the + * last of this vector. + * \return begin() + size(). + */ + iterator end(void); + + /*! This method returns a const_iterator pointing to one element past the + * last of this vector. + * \return begin() + size(). + */ + const_iterator end(void) const; + + /*! This method returns a const_iterator pointing to one element past the + * last of this vector. + * \return begin() + size(). + */ + const_iterator cend(void) const; + + /*! This method returns a reverse_iterator pointing to one element past the + * last of this vector's reversed sequence. + * \return rbegin() + size(). + */ + reverse_iterator rend(void); + + /*! This method returns a const_reverse_iterator pointing to one element past the + * last of this vector's reversed sequence. + * \return rbegin() + size(). + */ + const_reverse_iterator rend(void) const; + + /*! This method returns a const_reverse_iterator pointing to one element past the + * last of this vector's reversed sequence. + * \return rbegin() + size(). + */ + const_reverse_iterator crend(void) const; + + /*! This method returns a const_reference referring to the first element of this + * vector. + * \return The first element of this vector. + */ + const_reference front(void) const; + + /*! This method returns a reference pointing to the first element of this + * vector. + * \return The first element of this vector. + */ + reference front(void); + + /*! This method returns a const reference pointing to the last element of + * this vector. 
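+ *
+ *  A rough aside on the iterator accessors documented above: because begin() and end()
+ *  return device iterators, a \p device_vector can be streamed back to the host with an
+ *  ordinary Thrust copy (the element values below are assumptions):
+ *
+ *  \code
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/copy.h>
+ *  #include <iostream>
+ *  #include <iterator>
+ *  ...
+ *  thrust::device_vector<int> v(3);
+ *  v[0] = 10; v[1] = 20; v[2] = 30;
+ *
+ *  thrust::copy(v.begin(), v.end(), std::ostream_iterator<int>(std::cout, " "));
+ *
+ *  // prints: 10 20 30
+ *  \endcode
+ *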
+ * \return The last element of this vector. + */ + const_reference back(void) const; + + /*! This method returns a reference referring to the last element of + * this vector_dev. + * \return The last element of this vector. + */ + reference back(void); + + /*! This method returns a pointer to this vector's first element. + * \return A pointer to the first element of this vector. + */ + pointer data(void); + + /*! This method returns a const_pointer to this vector's first element. + * \return a const_pointer to the first element of this vector. + */ + const_pointer data(void) const; + + /*! This method resizes this vector to 0. + */ + void clear(void); + + /*! This method returns true iff size() == 0. + * \return true if size() == 0; false, otherwise. + */ + bool empty(void) const; + + /*! This method appends the given element to the end of this vector. + * \param x The element to append. + */ + void push_back(const value_type &x); + + /*! This method erases the last element of this vector, invalidating + * all iterators and references to it. + */ + void pop_back(void); + + /*! This method swaps the contents of this vector_base with another vector. + * \param v The vector with which to swap. + */ + void swap(device_vector &v); + + /*! This method removes the element at position pos. + * \param pos The position of the element of interest. + * \return An iterator pointing to the new location of the element that followed the element + * at position pos. + */ + iterator erase(iterator pos); + + /*! This method removes the range of elements [first,last) from this vector. + * \param first The beginning of the range of elements to remove. + * \param last The end of the range of elements to remove. + * \return An iterator pointing to the new location of the element that followed the last + * element in the sequence [first,last). + */ + iterator erase(iterator first, iterator last); + + /*! This method inserts a single copy of a given exemplar value at the + * specified position in this vector. + * \param position The insertion position. + * \param x The exemplar element to copy & insert. + * \return An iterator pointing to the newly inserted element. + */ + iterator insert(iterator position, const T &x); + + /*! This method inserts a copy of an exemplar value to a range at the + * specified position in this vector. + * \param position The insertion position + * \param n The number of insertions to perform. + * \param x The value to replicate and insert. + */ + void insert(iterator position, size_type n, const T &x); + + /*! This method inserts a copy of an input range at the specified position + * in this vector. + * \param position The insertion position. + * \param first The beginning of the range to copy. + * \param last The end of the range to copy. + * + * \tparam InputIterator is a model of Assignable. + */ + template + void insert(iterator position, InputIterator first, InputIterator last); + + /*! This version of \p assign replicates a given exemplar + * \p n times into this vector. + * \param n The number of times to copy \p x. + * \param x The exemplar element to replicate. + */ + void assign(size_type n, const T &x); + + /*! This version of \p assign makes this vector a copy of a given input range. + * \param first The beginning of the range to copy. + * \param last The end of the range to copy. + * + * \tparam InputIterator is a model of Input Iterator. + */ + template + void assign(InputIterator first, InputIterator last); + + /*! This method returns a copy of this vector's allocator. 
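+ *
+ *  A short sketch of the modifiers documented above (push_back, insert, erase, assign);
+ *  the concrete values are assumptions:
+ *
+ *  \code
+ *  #include <thrust/device_vector.h>
+ *  ...
+ *  thrust::device_vector<int> v(3, 1);   // {1, 1, 1}
+ *  v.push_back(4);                       // {1, 1, 1, 4}
+ *  v.insert(v.begin() + 1, 2, 7);        // {1, 7, 7, 1, 1, 4}
+ *  v.erase(v.begin());                   // {7, 7, 1, 1, 4}
+ *  v.assign(2, 9);                       // {9, 9}
+ *  \endcode
+ *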
+ * \return A copy of the alloctor used by this vector. + */ + allocator_type get_allocator(void) const; +#endif // end doxygen-only members +}; // end device_vector + +/*! \} + */ + +} // end thrust + +#include + + diff --git a/compat/thrust/distance.h b/compat/thrust/distance.h new file mode 100644 index 0000000..67b4194 --- /dev/null +++ b/compat/thrust/distance.h @@ -0,0 +1,76 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file distance.h + * \brief Computes the size of a range + */ + +#pragma once + +#include +#include + +namespace thrust +{ + + +/*! \addtogroup iterators + * \{ + */ + +/*! \p distance finds the distance between \p first and \p last, i.e. the + * number of times that \p first must be incremented until it is equal to + * \p last. + * + * \param first The beginning of an input range of interest. + * \param last The end of an input range of interest. + * \return The distance between the beginning and end of the input range. + * + * \tparam InputIterator is a model of Input Iterator. + * + * \pre If \c InputIterator meets the requirements of random access iterator, \p last shall be reachable from \p first or + * \p first shall be reachable from \p last; otherwise, \p last shall be reachable from \p first. + * + * The following code snippet demonstrates how to use \p distance to compute + * the distance to one iterator from another. + * + * \code + * #include + * #include + * ... + * thrust::device_vector vec(13); + * thrust::device_vector::iterator iter1 = vec.begin(); + * thrust::device_vector::iterator iter2 = iter1 + 7; + * + * int d = thrust::distance(iter1, iter2); + * + * // d is 7 + * \endcode + * + * \see http://www.sgi.com/tech/stl/distance.html + */ +template + inline typename thrust::iterator_traits::difference_type + distance(InputIterator first, InputIterator last); + +/*! \} // end iterators + */ + +} // end thrust + +#include + diff --git a/compat/thrust/equal.h b/compat/thrust/equal.h new file mode 100644 index 0000000..e96946f --- /dev/null +++ b/compat/thrust/equal.h @@ -0,0 +1,236 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file equal.h + * \brief Equality between ranges + */ + +#pragma once + +#include +#include + +namespace thrust +{ + + +/*! \addtogroup reductions + * \{ + * \addtogroup comparisons + * \ingroup reductions + * \{ + */ + + +/*! 
\p equal returns \c true if the two ranges [first1, last1) + * and [first2, first2 + (last1 - first1)) are identical when + * compared element-by-element, and otherwise returns \c false. + * + * This version of \p equal returns \c true if and only if for every + * iterator \c i in [first1, last1), *i == *(first2 + (i - first1)). + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first1 The beginning of the first sequence. + * \param last1 The end of the first sequence. + * \param first2 The beginning of the second sequence. + * \return \c true, if the sequences are equal; \c false, otherwise. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator1 is a model of Input Iterator, + * and \p InputIterator1's \c value_type is a model of Equality Comparable, + * and \p InputIterator1's \c value_type can be compared for equality with \c InputIterator2's \c value_type. + * \tparam InputIterator2 is a model of Input Iterator, + * and \p InputIterator2's \c value_type is a model of Equality Comparable, + * and \p InputIterator2's \c value_type can be compared for equality with \c InputIterator1's \c value_type. + * + * The following code snippet demonstrates how to use \p equal to test + * two ranges for equality using the \p thrust::host execution policy: + * + * \code + * #include + * #include + * ... + * int A1[7] = {3, 1, 4, 1, 5, 9, 3}; + * int A2[7] = {3, 1, 4, 2, 8, 5, 7}; + * ... + * bool result = thrust::equal(thrust::host, A1, A1 + 7, A2); + * + * // result == false + * \endcode + * + * \see http://www.sgi.com/tech/stl/equal.html + */ +template +bool equal(const thrust::detail::execution_policy_base &exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2); + + +/*! \p equal returns \c true if the two ranges [first1, last1) + * and [first2, first2 + (last1 - first1)) are identical when + * compared element-by-element, and otherwise returns \c false. + * + * This version of \p equal returns \c true if and only if for every + * iterator \c i in [first1, last1), *i == *(first2 + (i - first1)). + * + * \param first1 The beginning of the first sequence. + * \param last1 The end of the first sequence. + * \param first2 The beginning of the second sequence. + * \return \c true, if the sequences are equal; \c false, otherwise. + * + * \tparam InputIterator1 is a model of Input Iterator, + * and \p InputIterator1's \c value_type is a model of Equality Comparable, + * and \p InputIterator1's \c value_type can be compared for equality with \c InputIterator2's \c value_type. + * \tparam InputIterator2 is a model of Input Iterator, + * and \p InputIterator2's \c value_type is a model of Equality Comparable, + * and \p InputIterator2's \c value_type can be compared for equality with \c InputIterator1's \c value_type. + * + * The following code snippet demonstrates how to use \p equal to test + * two ranges for equality. + * + * \code + * #include + * ... + * int A1[7] = {3, 1, 4, 1, 5, 9, 3}; + * int A2[7] = {3, 1, 4, 2, 8, 5, 7}; + * ... + * bool result = thrust::equal(A1, A1 + 7, A2); + * + * // result == false + * \endcode + * + * \see http://www.sgi.com/tech/stl/equal.html + */ +template +bool equal(InputIterator1 first1, InputIterator1 last1, + InputIterator2 first2); + + +/*! 
\p equal returns \c true if the two ranges [first1, last1) + * and [first2, first2 + (last1 - first1)) are identical when + * compared element-by-element, and otherwise returns \c false. + * + * This version of \p equal returns \c true if and only if for every + * iterator \c i in [first1, last1), + * binary_pred(*i, *(first2 + (i - first1))) is \c true. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first1 The beginning of the first sequence. + * \param last1 The end of the first sequence. + * \param first2 The beginning of the second sequence. + * \param binary_pred Binary predicate used to test element equality. + * \return \c true, if the sequences are equal; \c false, otherwise. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator1 is a model of Input Iterator, + * and \p InputIterator1's \c value_type is convertible to \p BinaryPredicate's \c first_argument_type. + * \tparam InputIterator2 is a model of Input Iterator, + * and \p InputIterator2's \c value_type is convertible to \p BinaryPredicate's \c second_argument_type. + * \tparam BinaryPredicate is a model of Binary Predicate. + * + * The following code snippet demonstrates how to use \p equal to compare the + * elements in two ranges modulo 2 using the \p thrust::host execution policy. + * + * \code + * #include + * #include + * ... + * + * __host__ __device__ + * struct compare_modulo_two + * { + * bool operator()(int x, int y) + * { + * return (x % 2) == (y % 2); + * } + * }; + * ... + * int x[5] = {0, 2, 4, 6, 8, 10}; + * int y[5] = {1, 3, 5, 7, 9, 11}; + * + * bool result = thrust::equal(x, x + 5, y, compare_modulo_two()); + * + * // result is true + * \endcode + * + * \see http://www.sgi.com/tech/stl/equal.html + */ +template +bool equal(const thrust::detail::execution_policy_base &exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, BinaryPredicate binary_pred); + + +/*! \p equal returns \c true if the two ranges [first1, last1) + * and [first2, first2 + (last1 - first1)) are identical when + * compared element-by-element, and otherwise returns \c false. + * + * This version of \p equal returns \c true if and only if for every + * iterator \c i in [first1, last1), + * binary_pred(*i, *(first2 + (i - first1))) is \c true. + * + * \param first1 The beginning of the first sequence. + * \param last1 The end of the first sequence. + * \param first2 The beginning of the second sequence. + * \param binary_pred Binary predicate used to test element equality. + * \return \c true, if the sequences are equal; \c false, otherwise. + * + * \tparam InputIterator1 is a model of Input Iterator, + * and \p InputIterator1's \c value_type is convertible to \p BinaryPredicate's \c first_argument_type. + * \tparam InputIterator2 is a model of Input Iterator, + * and \p InputIterator2's \c value_type is convertible to \p BinaryPredicate's \c second_argument_type. + * \tparam BinaryPredicate is a model of Binary Predicate. + * + * The following code snippet demonstrates how to use \p equal to compare the + * elements in two ranges modulo 2. + * + * \code + * #include + * + * __host__ __device__ + * struct compare_modulo_two + * { + * bool operator()(int x, int y) + * { + * return (x % 2) == (y % 2); + * } + * }; + * ... 
+ * int x[5] = {0, 2, 4, 6, 8, 10}; + * int y[5] = {1, 3, 5, 7, 9, 11}; + * + * bool result = thrust::equal(x, x + 5, y, compare_modulo_two()); + * + * // result is true + * \endcode + * + * \see http://www.sgi.com/tech/stl/equal.html + */ +template +bool equal(InputIterator1 first1, InputIterator1 last1, + InputIterator2 first2, BinaryPredicate binary_pred); + + +/*! \} // end comparisons + * \} // end reductions + */ + +} // end namespace thrust + +#include + diff --git a/compat/thrust/execution_policy.h b/compat/thrust/execution_policy.h new file mode 100644 index 0000000..a5b61e9 --- /dev/null +++ b/compat/thrust/execution_policy.h @@ -0,0 +1,351 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file thrust/execution_policy.h + * \brief Thrust execution policies. + */ + +#pragma once + +#include + +// get the definition of thrust::execution_policy +#include + +// #include the host system's execution_policy header +#define __THRUST_HOST_SYSTEM_EXECUTION_POLICY_HEADER <__THRUST_HOST_SYSTEM_ROOT/execution_policy.h> +#include __THRUST_HOST_SYSTEM_EXECUTION_POLICY_HEADER +#undef __THRUST_HOST_SYSTEM_EXECUTION_POLICY_HEADER + +// #include the device system's execution_policy.h header +#define __THRUST_DEVICE_SYSTEM_EXECUTION_POLICY_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/execution_policy.h> +#include __THRUST_DEVICE_SYSTEM_EXECUTION_POLICY_HEADER +#undef __THRUST_DEVICE_SYSTEM_EXECUTION_POLICY_HEADER + +namespace thrust +{ + + +/*! \cond + */ + + +namespace detail +{ + + +typedef thrust::system::__THRUST_HOST_SYSTEM_NAMESPACE::detail::par_t host_t; + + +typedef thrust::system::__THRUST_DEVICE_SYSTEM_NAMESPACE::detail::par_t device_t; + + +} // end detail + + +/*! \endcond + */ + + +/*! \addtogroup execution_policies Parallel Execution Policies + * \{ + */ + + +// define execution_policy for the purpose of Doxygenating it +// it is actually defined elsewhere +#if 0 +/*! \p execution_policy is the base class for all Thrust parallel execution policies + * like \p thrust::host, \p thrust::device, and each backend system's tag type. + * + * Custom user-defined backends should derive a policy from this type in order to + * interoperate with Thrust algorithm dispatch. + * + * The following code snippet demonstrates how to derive a standalone custom execution policy + * from \p thrust::execution_policy to implement a backend which only implements \p for_each: + * + * \code + * #include + * #include + * + * // define a type derived from thrust::execution_policy to distinguish our custom execution policy: + * struct my_policy : thrust::execution_policy {}; + * + * // overload for_each on my_policy + * template + * Iterator for_each(my_policy, Iterator first, Iterator last, Function f) + * { + * std::cout << "Hello, world from for_each(my_policy)!" 
<< std::endl; + * + * for(; first < last; ++first) + * { + * f(*first); + * } + * + * return first; + * } + * + * struct ignore_argument + * { + * void operator()(int) {} + * }; + * + * int main() + * { + * int data[4]; + * + * // dispatch thrust::for_each using our custom policy: + * my_policy exec; + * thrust::for_each(exec, data, data + 4, ignore_argument()); + * + * // can't dispatch thrust::transform because no overload exists for my_policy: + * //thrust::transform(exec, data, data, + 4, data, thrust::identity()); // error! + * + * return 0; + * } + * \endcode + * + * \see host_execution_policy + * \see device_execution_policy + */ +template +struct execution_policy : thrust::detail::execution_policy_base +{}; +#endif + + +/*! \p host_execution_policy is the base class for all Thrust parallel execution policies + * which are derived from Thrust's default host backend system configured with the \p THRUST_HOST_SYSTEM + * macro. + * + * Custom user-defined backends which wish to inherit the functionality of Thrust's host backend system + * should derive a policy from this type in order to interoperate with Thrust algorithm dispatch. + * + * The following code snippet demonstrates how to derive a standalone custom execution policy from + * \p thrust::host_execution_policy to implement a backend which specializes \p for_each while inheriting + * the behavior of every other algorithm from the host system: + * + * \code + * #include + * #include + * + * // define a type derived from thrust::host_execution_policy to distinguish our custom execution policy: + * struct my_policy : thrust::host_execution_policy {}; + * + * // overload for_each on my_policy + * template + * Iterator for_each(my_policy, Iterator first, Iterator last, Function f) + * { + * std::cout << "Hello, world from for_each(my_policy)!" << std::endl; + * + * for(; first < last; ++first) + * { + * f(*first); + * } + * + * return first; + * } + * + * struct ignore_argument + * { + * void operator()(int) {} + * }; + * + * int main() + * { + * int data[4]; + * + * // dispatch thrust::for_each using our custom policy: + * my_policy exec; + * thrust::for_each(exec, data, data + 4, ignore_argument()); + * + * // dispatch thrust::transform whose behavior our policy inherits + * thrust::transform(exec, data, data, + 4, data, thrust::identity()); + * + * return 0; + * } + * \endcode + * + * \see execution_policy + * \see device_execution_policy + */ +template + struct host_execution_policy + : thrust::system::__THRUST_HOST_SYSTEM_NAMESPACE::execution_policy +{}; + + +/*! \p device_execution_policy is the base class for all Thrust parallel execution policies + * which are derived from Thrust's default device backend system configured with the \p THRUST_DEVICE_SYSTEM + * macro. + * + * Custom user-defined backends which wish to inherit the functionality of Thrust's device backend system + * should derive a policy from this type in order to interoperate with Thrust algorithm dispatch. 
+ * + * The following code snippet demonstrates how to derive a standalone custom execution policy from + * \p thrust::device_execution_policy to implement a backend which specializes \p for_each while inheriting + * the behavior of every other algorithm from the device system: + * + * \code + * #include + * #include + * + * // define a type derived from thrust::device_execution_policy to distinguish our custom execution policy: + * struct my_policy : thrust::device_execution_policy {}; + * + * // overload for_each on my_policy + * template + * Iterator for_each(my_policy, Iterator first, Iterator last, Function f) + * { + * std::cout << "Hello, world from for_each(my_policy)!" << std::endl; + * + * for(; first < last; ++first) + * { + * f(*first); + * } + * + * return first; + * } + * + * struct ignore_argument + * { + * void operator()(int) {} + * }; + * + * int main() + * { + * int data[4]; + * + * // dispatch thrust::for_each using our custom policy: + * my_policy exec; + * thrust::for_each(exec, data, data + 4, ignore_argument()); + * + * // dispatch thrust::transform whose behavior our policy inherits + * thrust::transform(exec, data, data, + 4, data, thrust::identity()); + * + * return 0; + * } + * \endcode + * + * \see execution_policy + * \see host_execution_policy + */ +template + struct device_execution_policy + : thrust::system::__THRUST_DEVICE_SYSTEM_NAMESPACE::execution_policy +{}; + + +/*! \p thrust::host is the default parallel execution policy associated with Thrust's host backend system + * configured by the \p THRUST_HOST_SYSTEM macro. + * + * Instead of relying on implicit algorithm dispatch through iterator system tags, users may directly target + * algorithm dispatch at Thrust's host system by providing \p thrust::host as an algorithm parameter. + * + * Explicit dispatch can be useful in avoiding the introduction of data copies into containers such as + * \p thrust::host_vector. + * + * Note that even though \p thrust::host targets the host CPU, it is a parallel execution policy. That is, + * the order that an algorithm invokes functors or dereferences iterators is not defined. + * + * The type of \p thrust::host is implementation-defined. + * + * The following code snippet demonstrates how to use \p thrust::host to explicitly dispatch an invocation + * of \p thrust::for_each to the host backend system: + * + * \code + * #include + * #include + * #include + * + * struct printf_functor + * { + * __host__ __device__ + * void operator()(int x) + * { + * printf("%d\n"); + * } + * }; + * ... + * int vec[3]; + * vec[0] = 0; vec[1] = 1; vec[2] = 2; + * + * thrust::for_each(thrust::host, vec.begin(), vec.end(), printf_functor()); + * + * // 0 1 2 is printed to standard output in some unspecified order + * \endcode + * + * \see host_execution_policy + * \see thrust::device + */ +static const detail::host_t host; + + +/*! \p thrust::device is the default parallel execution policy associated with Thrust's device backend system + * configured by the \p THRUST_DEVICE_SYSTEM macro. + * + * Instead of relying on implicit algorithm dispatch through iterator system tags, users may directly target + * algorithm dispatch at Thrust's device system by providing \p thrust::device as an algorithm parameter. + * + * Explicit dispatch can be useful in avoiding the introduction of data copies into containers such as + * \p thrust::device_vector or to avoid wrapping e.g. raw pointers allocated by the CUDA API with types + * such as \p thrust::device_ptr. 
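+ *
+ *  A rough sketch of that use case, with raw device memory and explicit \p thrust::device
+ *  dispatch (the allocation size and fill value are assumptions):
+ *
+ *  \code
+ *  #include <thrust/fill.h>
+ *  #include <thrust/execution_policy.h>
+ *  #include <cuda_runtime.h>
+ *  ...
+ *  int *raw_ptr;
+ *  cudaMalloc(&raw_ptr, 16 * sizeof(int));
+ *
+ *  // explicit dispatch lets the raw pointer be used directly, without a thrust::device_ptr wrapper
+ *  thrust::fill(thrust::device, raw_ptr, raw_ptr + 16, 13);
+ *
+ *  cudaFree(raw_ptr);
+ *  \endcode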
+ *
+ * The user must take care to guarantee that the iterators provided to an algorithm are compatible with
+ * the device backend system. For example, raw pointers allocated by std::malloc typically
+ * cannot be dereferenced by a GPU. For this reason, raw pointers allocated by host APIs should not be mixed
+ * with a \p thrust::device algorithm invocation when the device backend is CUDA.
+ *
+ * The type of \p thrust::device is implementation-defined.
+ *
+ * The following code snippet demonstrates how to use \p thrust::device to explicitly dispatch an invocation
+ * of \p thrust::for_each to the device backend system:
+ *
+ * \code
+ * #include
+ * #include
+ * #include
+ * #include
+ *
+ * struct printf_functor
+ * {
+ *   __host__ __device__
+ *   void operator()(int x)
+ *   {
+ *     printf("%d\n", x);
+ *   }
+ * };
+ * ...
+ * thrust::device_vector d_vec(3);
+ * d_vec[0] = 0; d_vec[1] = 1; d_vec[2] = 2;
+ *
+ * thrust::for_each(thrust::device, d_vec.begin(), d_vec.end(), printf_functor());
+ *
+ * // 0 1 2 is printed to standard output in some unspecified order
+ * \endcode
+ *
+ * \see host_execution_policy
+ * \see thrust::device
+ */
+static const detail::device_t device;
+
+
+/*! \}
+ */
+
+
+} // end thrust
+
diff --git a/compat/thrust/extrema.h b/compat/thrust/extrema.h
new file mode 100644
index 0000000..335bcd1
--- /dev/null
+++ b/compat/thrust/extrema.h
@@ -0,0 +1,798 @@
+/*
+ * Copyright 2008-2012 NVIDIA Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*! \file extrema.h
+ * \brief Functions for computing extremal values
+ */
+
+#pragma once
+
+#include
+#include
+#include
+
+namespace thrust
+{
+
+
+/*! This version of \p min returns the smaller of two values, given a comparison operation.
+ * \param lhs The first value to compare.
+ * \param rhs The second value to compare.
+ * \param comp A comparison operation.
+ * \return The smaller element.
+ *
+ * \tparam T is convertible to \p BinaryPredicate's first argument type and to its second argument type.
+ * \tparam BinaryPredicate is a model of BinaryPredicate.
+ *
+ * The following code snippet demonstrates how to use \p min to compute the smaller of two
+ * key-value objects.
+ *
+ * \code
+ * #include
+ * ...
+ * struct key_value
+ * {
+ *   int key;
+ *   int value;
+ * };
+ *
+ * struct compare_key_value
+ * {
+ *   __host__ __device__
+ *   bool operator()(key_value lhs, key_value rhs)
+ *   {
+ *     return lhs.key < rhs.key;
+ *   }
+ * };
+ *
+ * ...
+ * key_value a = {13, 0};
+ * key_value b = { 7, 1};
+ *
+ * key_value smaller = thrust::min(a, b, compare_key_value());
+ *
+ * // smaller is {7, 1}
+ * \endcode
+ *
+ * \note Returns the first argument when the arguments are equivalent.
+ * \see max
+ */
+template
+__host__ __device__
+  T min THRUST_PREVENT_MACRO_SUBSTITUTION (const T &lhs, const T &rhs, BinaryPredicate comp);
+
+
+/*! This version of \p min returns the smaller of two values.
+ * \param lhs The first value to compare.
+ * \param rhs The second value to compare.
+ * \return The smaller element.
+ *
+ * \tparam T is a model of LessThan Comparable.
+ *
+ * The following code snippet demonstrates how to use \p min to compute the smaller of two
+ * integers.
+ *
+ * \code
+ * #include
+ * ...
+ * int a = 13;
+ * int b = 7;
+ *
+ * int smaller = thrust::min(a, b);
+ *
+ * // smaller is 7
+ * \endcode
+ *
+ * \note Returns the first argument when the arguments are equivalent.
+ * \see max
+ */
+template
+__host__ __device__
+  T min THRUST_PREVENT_MACRO_SUBSTITUTION (const T &lhs, const T &rhs);
+
+
+/*! This version of \p max returns the larger of two values, given a comparison operation.
+ * \param lhs The first value to compare.
+ * \param rhs The second value to compare.
+ * \param comp A comparison operation.
+ * \return The larger element.
+ *
+ * \tparam T is convertible to \p BinaryPredicate's first argument type and to its second argument type.
+ * \tparam BinaryPredicate is a model of BinaryPredicate.
+ *
+ * The following code snippet demonstrates how to use \p max to compute the larger of two
+ * key-value objects.
+ *
+ * \code
+ * #include
+ * ...
+ * struct key_value
+ * {
+ *   int key;
+ *   int value;
+ * };
+ *
+ * struct compare_key_value
+ * {
+ *   __host__ __device__
+ *   bool operator()(key_value lhs, key_value rhs)
+ *   {
+ *     return lhs.key < rhs.key;
+ *   }
+ * };
+ *
+ * ...
+ * key_value a = {13, 0};
+ * key_value b = { 7, 1};
+ *
+ * key_value larger = thrust::max(a, b, compare_key_value());
+ *
+ * // larger is {13, 0}
+ * \endcode
+ *
+ * \note Returns the first argument when the arguments are equivalent.
+ * \see min
+ */
+template
+__host__ __device__
+  T max THRUST_PREVENT_MACRO_SUBSTITUTION (const T &lhs, const T &rhs, BinaryPredicate comp);
+
+
+/*! This version of \p max returns the larger of two values.
+ * \param lhs The first value to compare.
+ * \param rhs The second value to compare.
+ * \return The larger element.
+ *
+ * \tparam T is a model of LessThan Comparable.
+ *
+ * The following code snippet demonstrates how to use \p max to compute the larger of two
+ * integers.
+ *
+ * \code
+ * #include
+ * ...
+ * int a = 13;
+ * int b = 7;
+ *
+ * int larger = thrust::max(a, b);
+ *
+ * // larger is 13
+ * \endcode
+ *
+ * \note Returns the first argument when the arguments are equivalent.
+ * \see min
+ */
+template
+__host__ __device__
+  T max THRUST_PREVENT_MACRO_SUBSTITUTION (const T &lhs, const T &rhs);
+
+
+/*! \addtogroup reductions
+ * \{
+ * \addtogroup extrema
+ * \ingroup reductions
+ * \{
+ */
+
+/*! \p min_element finds the smallest element in the range [first, last).
+ * It returns the first iterator \c i in [first, last)
+ * such that no other iterator in [first, last) points to a value smaller
+ * than \c *i. The return value is \p last if and only if [first, last) is an
+ * empty range.
+ *
+ * The two versions of \p min_element differ in how they define whether one element is
+ * less than another. This version compares objects using \c operator<. Specifically,
+ * this version of \p min_element returns the first iterator \c i in [first, last)
+ * such that, for every iterator \c j in [first, last), *j < *i is
+ * \c false.
+ *
+ * The algorithm's execution is parallelized as determined by \p exec.
+ *
+ * \param exec The execution policy to use for parallelization.
+ * \param first The beginning of the sequence.
+ * \param last The end of the sequence.
+ * \return An iterator pointing to the smallest element of the range [first, last),
+ *         if it is not an empty range; \p last, otherwise.
+ * + * \tparam ForwardIterator is a model of Forward Iterator, + * and \c ForwardIterator's \c value_type is a model of + * LessThan Comparable. + * + * \code + * #include + * #include + * ... + * int data[6] = {1, 0, 2, 2, 1, 3}; + * int *result = thrust::min_element(thrust::host, data, data + 6); + * + * // result is data + 1 + * // *result is 0 + * \endcode + * + * \see http://www.sgi.com/tech/stl/min_element.html + */ +template +ForwardIterator min_element(const thrust::detail::execution_policy_base &exec, ForwardIterator first, ForwardIterator last); + + +/*! \p min_element finds the smallest element in the range [first, last). + * It returns the first iterator \c i in [first, last) + * such that no other iterator in [first, last) points to a value smaller + * than \c *i. The return value is \p last if and only if [first, last) is an + * empty range. + * + * The two versions of \p min_element differ in how they define whether one element is + * less than another. This version compares objects using \c operator<. Specifically, + * this version of \p min_element returns the first iterator \c i in [first, last) + * such that, for every iterator \c j in [first, last), *j < *i is + * \c false. + * + * \param first The beginning of the sequence. + * \param last The end of the sequence. + * \return An iterator pointing to the smallest element of the range [first, last), + * if it is not an empty range; \p last, otherwise. + * + * \tparam ForwardIterator is a model of Forward Iterator, + * and \c ForwardIterator's \c value_type is a model of + * LessThan Comparable. + * + * \code + * #include + * ... + * int data[6] = {1, 0, 2, 2, 1, 3}; + * int *result = thrust::min_element(data, data + 6); + * + * // result is data + 1 + * // *result is 0 + * \endcode + * + * \see http://www.sgi.com/tech/stl/min_element.html + */ +template +ForwardIterator min_element(ForwardIterator first, ForwardIterator last); + + +/*! \p min_element finds the smallest element in the range [first, last). + * It returns the first iterator \c i in [first, last) + * such that no other iterator in [first, last) points to a value smaller + * than \c *i. The return value is \p last if and only if [first, last) is an + * empty range. + * + * The two versions of \p min_element differ in how they define whether one element is + * less than another. This version compares objects using a function object \p comp. + * Specifically, this version of \p min_element returns the first iterator \c i in [first, last) + * such that, for every iterator \c j in [first, last), comp(*j, *i) is + * \c false. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the sequence. + * \param last The end of the sequence. + * \param comp A binary predicate used for comparison. + * \return An iterator pointing to the smallest element of the range [first, last), + * if it is not an empty range; \p last, otherwise. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam ForwardIterator is a model of Forward Iterator, + * and \p ForwardIterator's \c value_type is convertible to both \p comp's + * \c first_argument_type and \c second_argument_type. + * \tparam BinaryPredicate is a model of Binary Predicate. 
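+ *
+ *  The overloads above can equally be dispatched to the device backend; a rough
+ *  additional sketch using a \p device_vector (the contents are assumptions):
+ *
+ *  \code
+ *  #include <thrust/extrema.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  thrust::device_vector<int> d(6);
+ *  d[0] = 1; d[1] = 0; d[2] = 2; d[3] = 2; d[4] = 1; d[5] = 3;
+ *
+ *  thrust::device_vector<int>::iterator it =
+ *      thrust::min_element(thrust::device, d.begin(), d.end());
+ *
+ *  // it - d.begin() is 1, and *it is 0
+ *  \endcode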
+ * + * The following code snippet demonstrates how to use \p min_element to find the smallest element + * of a collection of key-value pairs using the \p thrust::host execution policy for parallelization: + * + * \code + * #include + * #include + * ... + * + * struct key_value + * { + * int key; + * int value; + * }; + * + * struct compare_key_value + * { + * __host__ __device__ + * bool operator()(key_value lhs, key_value rhs) + * { + * return lhs.key < rhs.key; + * } + * }; + * + * ... + * key_value data[4] = { {4,5}, {0,7}, {2,3}, {6,1} }; + * + * key_value *smallest = thrust::min_element(thrust::host, data, data + 4, compare_key_value()); + * + * // smallest == data + 1 + * // *smallest == {0,7} + * \endcode + * + * \see http://www.sgi.com/tech/stl/min_element.html + */ +template +ForwardIterator min_element(const thrust::detail::execution_policy_base &exec, ForwardIterator first, ForwardIterator last, BinaryPredicate comp); + + +/*! \p min_element finds the smallest element in the range [first, last). + * It returns the first iterator \c i in [first, last) + * such that no other iterator in [first, last) points to a value smaller + * than \c *i. The return value is \p last if and only if [first, last) is an + * empty range. + * + * The two versions of \p min_element differ in how they define whether one element is + * less than another. This version compares objects using a function object \p comp. + * Specifically, this version of \p min_element returns the first iterator \c i in [first, last) + * such that, for every iterator \c j in [first, last), comp(*j, *i) is + * \c false. + * + * \param first The beginning of the sequence. + * \param last The end of the sequence. + * \param comp A binary predicate used for comparison. + * \return An iterator pointing to the smallest element of the range [first, last), + * if it is not an empty range; \p last, otherwise. + * + * \tparam ForwardIterator is a model of Forward Iterator, + * and \p ForwardIterator's \c value_type is convertible to both \p comp's + * \c first_argument_type and \c second_argument_type. + * \tparam BinaryPredicate is a model of Binary Predicate. + * + * The following code snippet demonstrates how to use \p min_element to find the smallest element + * of a collection of key-value pairs. + * + * \code + * #include + * + * struct key_value + * { + * int key; + * int value; + * }; + * + * struct compare_key_value + * { + * __host__ __device__ + * bool operator()(key_value lhs, key_value rhs) + * { + * return lhs.key < rhs.key; + * } + * }; + * + * ... + * key_value data[4] = { {4,5}, {0,7}, {2,3}, {6,1} }; + * + * key_value *smallest = thrust::min_element(data, data + 4, compare_key_value()); + * + * // smallest == data + 1 + * // *smallest == {0,7} + * \endcode + * + * \see http://www.sgi.com/tech/stl/min_element.html + */ +template +ForwardIterator min_element(ForwardIterator first, ForwardIterator last, + BinaryPredicate comp); + + +/*! \p max_element finds the largest element in the range [first, last). + * It returns the first iterator \c i in [first, last) + * such that no other iterator in [first, last) points to a value larger + * than \c *i. The return value is \p last if and only if [first, last) is an + * empty range. + * + * The two versions of \p max_element differ in how they define whether one element is + * greater than another. This version compares objects using \c operator<. 
Specifically, + * this version of \p max_element returns the first iterator \c i in [first, last) + * such that, for every iterator \c j in [first, last), *i < *j is + * \c false. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the sequence. + * \param last The end of the sequence. + * \return An iterator pointing to the largest element of the range [first, last), + * if it is not an empty range; \p last, otherwise. + * + * \tparam A Thrust backend system. + * \tparam ForwardIterator is a model of Forward Iterator, + * and \c ForwardIterator's \c value_type is a model of + * LessThan Comparable. + * + * \code + * #include + * #include + * ... + * int data[6] = {1, 0, 2, 2, 1, 3}; + * int *result = thrust::max_element(thrust::host, data, data + 6); + * + * // *result == 3 + * \endcode + * + * \see http://www.sgi.com/tech/stl/max_element.html + */ +template +ForwardIterator max_element(const thrust::detail::execution_policy_base &exec, ForwardIterator first, ForwardIterator last); + + +/*! \p max_element finds the largest element in the range [first, last). + * It returns the first iterator \c i in [first, last) + * such that no other iterator in [first, last) points to a value larger + * than \c *i. The return value is \p last if and only if [first, last) is an + * empty range. + * + * The two versions of \p max_element differ in how they define whether one element is + * greater than another. This version compares objects using \c operator<. Specifically, + * this version of \p max_element returns the first iterator \c i in [first, last) + * such that, for every iterator \c j in [first, last), *i < *j is + * \c false. + * + * \param first The beginning of the sequence. + * \param last The end of the sequence. + * \return An iterator pointing to the largest element of the range [first, last), + * if it is not an empty range; \p last, otherwise. + * + * \tparam ForwardIterator is a model of Forward Iterator, + * and \c ForwardIterator's \c value_type is a model of + * LessThan Comparable. + * + * \code + * #include + * ... + * int data[6] = {1, 0, 2, 2, 1, 3}; + * int *result = thrust::max_element(data, data + 6); + * + * // *result == 3 + * \endcode + * + * \see http://www.sgi.com/tech/stl/max_element.html + */ +template +ForwardIterator max_element(ForwardIterator first, ForwardIterator last); + + +/*! \p max_element finds the largest element in the range [first, last). + * It returns the first iterator \c i in [first, last) + * such that no other iterator in [first, last) points to a value larger + * than \c *i. The return value is \p last if and only if [first, last) is an + * empty range. + * + * The two versions of \p max_element differ in how they define whether one element is + * less than another. This version compares objects using a function object \p comp. + * Specifically, this version of \p max_element returns the first iterator \c i in [first, last) + * such that, for every iterator \c j in [first, last), comp(*i, *j) is + * \c false. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the sequence. + * \param last The end of the sequence. + * \param comp A binary predicate used for comparison. 
+ * \return An iterator pointing to the largest element of the range [first, last), + * if it is not an empty range; \p last, otherwise. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam ForwardIterator is a model of Forward Iterator, + * and \p ForwardIterator's \c value_type is convertible to both \p comp's + * \c first_argument_type and \c second_argument_type. + * \tparam BinaryPredicate is a model of Binary Predicate. + * + * The following code snippet demonstrates how to use \p max_element to find the largest element + * of a collection of key-value pairs using the \p thrust::host execution policy for parallelization. + * + * \code + * #include + * #include + * ... + * + * struct key_value + * { + * int key; + * int value; + * }; + * + * struct compare_key_value + * { + * __host__ __device__ + * bool operator()(key_value lhs, key_value rhs) + * { + * return lhs.key < rhs.key; + * } + * }; + * + * ... + * key_value data[4] = { {4,5}, {0,7}, {2,3}, {6,1} }; + * + * key_value *largest = thrust::max_element(thrust::host, data, data + 4, compare_key_value()); + * + * // largest == data + 3 + * // *largest == {6,1} + * \endcode + * + * \see http://www.sgi.com/tech/stl/max_element.html + */ +template +ForwardIterator max_element(const thrust::detail::execution_policy_base &exec, ForwardIterator first, ForwardIterator last, BinaryPredicate comp); + + +/*! \p max_element finds the largest element in the range [first, last). + * It returns the first iterator \c i in [first, last) + * such that no other iterator in [first, last) points to a value larger + * than \c *i. The return value is \p last if and only if [first, last) is an + * empty range. + * + * The two versions of \p max_element differ in how they define whether one element is + * less than another. This version compares objects using a function object \p comp. + * Specifically, this version of \p max_element returns the first iterator \c i in [first, last) + * such that, for every iterator \c j in [first, last), comp(*i, *j) is + * \c false. + * + * \param first The beginning of the sequence. + * \param last The end of the sequence. + * \param comp A binary predicate used for comparison. + * \return An iterator pointing to the largest element of the range [first, last), + * if it is not an empty range; \p last, otherwise. + * + * \tparam ForwardIterator is a model of Forward Iterator, + * and \p ForwardIterator's \c value_type is convertible to both \p comp's + * \c first_argument_type and \c second_argument_type. + * \tparam BinaryPredicate is a model of Binary Predicate. + * + * The following code snippet demonstrates how to use \p max_element to find the largest element + * of a collection of key-value pairs. + * + * \code + * #include + * + * struct key_value + * { + * int key; + * int value; + * }; + * + * struct compare_key_value + * { + * __host__ __device__ + * bool operator()(key_value lhs, key_value rhs) + * { + * return lhs.key < rhs.key; + * } + * }; + * + * ... + * key_value data[4] = { {4,5}, {0,7}, {2,3}, {6,1} }; + * + * key_value *largest = thrust::max_element(data, data + 4, compare_key_value()); + * + * // largest == data + 3 + * // *largest == {6,1} + * \endcode + * + * \see http://www.sgi.com/tech/stl/max_element.html + */ +template +ForwardIterator max_element(ForwardIterator first, ForwardIterator last, + BinaryPredicate comp); + + +/*! \p minmax_element finds the smallest and largest elements in the range [first, last). 
+ * It returns a pair of iterators (imin, imax) where \c imin is the same iterator + * returned by \p min_element and \c imax is the same iterator returned by \p max_element. + * This function is potentially more efficient than separate calls to \p min_element and \p max_element. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the sequence. + * \param last The end of the sequence. + * \return A pair of iterator pointing to the smallest and largest elements of the range [first, last), + * if it is not an empty range; \p last, otherwise. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam ForwardIterator is a model of Forward Iterator, + * and \c ForwardIterator's \c value_type is a model of + * LessThan Comparable. + * + * \code + * #include + * #include + * ... + * int data[6] = {1, 0, 2, 2, 1, 3}; + * thrust::pair result = thrust::minmax_element(thrust::host, data, data + 6); + * + * // result.first is data + 1 + * // result.second is data + 5 + * // *result.first is 0 + * // *result.second is 3 + * \endcode + * + * \see min_element + * \see max_element + * \see http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2005/n1840.pdf + */ +template +thrust::pair minmax_element(const thrust::detail::execution_policy_base &exec, ForwardIterator first, ForwardIterator last); + + +/*! \p minmax_element finds the smallest and largest elements in the range [first, last). + * It returns a pair of iterators (imin, imax) where \c imin is the same iterator + * returned by \p min_element and \c imax is the same iterator returned by \p max_element. + * This function is potentially more efficient than separate calls to \p min_element and \p max_element. + * + * \param first The beginning of the sequence. + * \param last The end of the sequence. + * \return A pair of iterator pointing to the smallest and largest elements of the range [first, last), + * if it is not an empty range; \p last, otherwise. + * + * \tparam ForwardIterator is a model of Forward Iterator, + * and \c ForwardIterator's \c value_type is a model of + * LessThan Comparable. + * + * \code + * #include + * ... + * int data[6] = {1, 0, 2, 2, 1, 3}; + * thrust::pair result = thrust::minmax_element(data, data + 6); + * + * // result.first is data + 1 + * // result.second is data + 5 + * // *result.first is 0 + * // *result.second is 3 + * \endcode + * + * \see min_element + * \see max_element + * \see http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2005/n1840.pdf + */ +template +thrust::pair minmax_element(ForwardIterator first, + ForwardIterator last); + + +/*! \p minmax_element finds the smallest and largest elements in the range [first, last). + * It returns a pair of iterators (imin, imax) where \c imin is the same iterator + * returned by \p min_element and \c imax is the same iterator returned by \p max_element. + * This function is potentially more efficient than separate calls to \p min_element and \p max_element. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the sequence. + * \param last The end of the sequence. + * \param comp A binary predicate used for comparison. + * \return A pair of iterator pointing to the smallest and largest elements of the range [first, last), + * if it is not an empty range; \p last, otherwise. 
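+ *
+ * As a complement to the comparator example below, a minimal sketch of the
+ * simpler overload applied to a device_vector (the header names and the use
+ * of the default operator< comparison are assumptions for illustration):
+ *
+ * \code
+ * #include <thrust/extrema.h>
+ * #include <thrust/device_vector.h>
+ * #include <thrust/pair.h>
+ * ...
+ * thrust::device_vector<int> d_data(6);
+ * // ... fill d_data with values ...
+ * typedef thrust::device_vector<int>::iterator Iter;
+ * thrust::pair<Iter,Iter> extrema =
+ *     thrust::minmax_element(d_data.begin(), d_data.end());
+ * // *extrema.first is the smallest value, *extrema.second the largest
+ * \endcode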
+ * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam ForwardIterator is a model of Forward Iterator, + * and \p ForwardIterator's \c value_type is convertible to both \p comp's + * \c first_argument_type and \c second_argument_type. + * \tparam BinaryPredicate is a model of Binary Predicate. + * + * The following code snippet demonstrates how to use \p minmax_element to find the smallest and largest elements + * of a collection of key-value pairs using the \p thrust::host execution policy for parallelization: + * + * \code + * #include + * #include + * #include + * ... + * + * struct key_value + * { + * int key; + * int value; + * }; + * + * struct compare_key_value + * { + * __host__ __device__ + * bool operator()(key_value lhs, key_value rhs) + * { + * return lhs.key < rhs.key; + * } + * }; + * + * ... + * key_value data[4] = { {4,5}, {0,7}, {2,3}, {6,1} }; + * + * thrust::pair extrema = thrust::minmax_element(thrust::host, data, data + 4, compare_key_value()); + * + * // extrema.first == data + 1 + * // *extrema.first == {0,7} + * // extrema.second == data + 3 + * // *extrema.second == {6,1} + * \endcode + * + * \see min_element + * \see max_element + * \see http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2005/n1840.pdf + */ +template +thrust::pair minmax_element(const thrust::detail::execution_policy_base &exec, ForwardIterator first, ForwardIterator last, BinaryPredicate comp); + + +/*! \p minmax_element finds the smallest and largest elements in the range [first, last). + * It returns a pair of iterators (imin, imax) where \c imin is the same iterator + * returned by \p min_element and \c imax is the same iterator returned by \p max_element. + * This function is potentially more efficient than separate calls to \p min_element and \p max_element. + * + * \param first The beginning of the sequence. + * \param last The end of the sequence. + * \param comp A binary predicate used for comparison. + * \return A pair of iterator pointing to the smallest and largest elements of the range [first, last), + * if it is not an empty range; \p last, otherwise. + * + * \tparam ForwardIterator is a model of Forward Iterator, + * and \p ForwardIterator's \c value_type is convertible to both \p comp's + * \c first_argument_type and \c second_argument_type. + * \tparam BinaryPredicate is a model of Binary Predicate. + * + * The following code snippet demonstrates how to use \p minmax_element to find the smallest and largest elements + * of a collection of key-value pairs. + * + * \code + * #include + * #include + * + * struct key_value + * { + * int key; + * int value; + * }; + * + * struct compare_key_value + * { + * __host__ __device__ + * bool operator()(key_value lhs, key_value rhs) + * { + * return lhs.key < rhs.key; + * } + * }; + * + * ... + * key_value data[4] = { {4,5}, {0,7}, {2,3}, {6,1} }; + * + * thrust::pair extrema = thrust::minmax_element(data, data + 4, compare_key_value()); + * + * // extrema.first == data + 1 + * // *extrema.first == {0,7} + * // extrema.second == data + 3 + * // *extrema.second == {6,1} + * \endcode + * + * \see min_element + * \see max_element + * \see http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2005/n1840.pdf + */ +template +thrust::pair minmax_element(ForwardIterator first, + ForwardIterator last, + BinaryPredicate comp); + +/*! 
\} // end extrema + * \} // end reductions + */ + +} // end thrust + +#include +#include + diff --git a/compat/thrust/fill.h b/compat/thrust/fill.h new file mode 100644 index 0000000..b492cec --- /dev/null +++ b/compat/thrust/fill.h @@ -0,0 +1,205 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file fill.h + * \brief Fills a range with a constant value + */ + +#pragma once + +#include +#include + +namespace thrust +{ + + +/*! \addtogroup transformations + * \addtogroup filling + * \ingroup transformations + * \{ + */ + + +/*! \p fill assigns the value \p value to every element in + * the range [first, last). That is, for every + * iterator \c i in [first, last), it performs + * the assignment *i = value. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the sequence. + * \param last The end of the sequence. + * \param value The value to be copied. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam ForwardIterator is a model of Forward Iterator, + * and \p ForwardIterator is mutable. + * \tparam T is a model of Assignable, + * and \p T's \c value_type is convertible to \p ForwardIterator's \c value_type. + * + * The following code snippet demonstrates how to use \p fill to set a thrust::device_vector's + * elements to a given value using the \p thrust::device execution policy for parallelization: + * + * \code + * #include + * #include + * #include + * ... + * thrust::device_vector v(4); + * thrust::fill(thrust::device, v.begin(), v.end(), 137); + * + * // v[0] == 137, v[1] == 137, v[2] == 137, v[3] == 137 + * \endcode + * + * \see http://www.sgi.com/tech/stl/fill.html + * \see \c fill_n + * \see \c uninitialized_fill + */ +template + void fill(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + const T &value); + + +/*! \p fill assigns the value \p value to every element in + * the range [first, last). That is, for every + * iterator \c i in [first, last), it performs + * the assignment *i = value. + * + * \param first The beginning of the sequence. + * \param last The end of the sequence. + * \param value The value to be copied. + * + * \tparam ForwardIterator is a model of Forward Iterator, + * and \p ForwardIterator is mutable. + * \tparam T is a model of Assignable, + * and \p T's \c value_type is convertible to \p ForwardIterator's \c value_type. + * + * The following code snippet demonstrates how to use \p fill to set a thrust::device_vector's + * elements to a given value. + * + * \code + * #include + * #include + * ... 
+ * thrust::device_vector v(4); + * thrust::fill(v.begin(), v.end(), 137); + * + * // v[0] == 137, v[1] == 137, v[2] == 137, v[3] == 137 + * \endcode + * + * \see http://www.sgi.com/tech/stl/fill.html + * \see \c fill_n + * \see \c uninitialized_fill + */ +template + void fill(ForwardIterator first, + ForwardIterator last, + const T &value); + + +/*! \p fill_n assigns the value \p value to every element in + * the range [first, first+n). That is, for every + * iterator \c i in [first, first+n), it performs + * the assignment *i = value. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the sequence. + * \param n The size of the sequence. + * \param value The value to be copied. + * \return first + n + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam OutputIterator is a model of Output Iterator. + * \tparam T is a model of Assignable, + * and \p T's \c value_type is convertible to a type in \p OutputIterator's set of \c value_type. + * + * The following code snippet demonstrates how to use \p fill to set a thrust::device_vector's + * elements to a given value using the \p thrust::device execution policy for parallelization: + * + * \code + * #include + * #include + * #include + * ... + * thrust::device_vector v(4); + * thrust::fill_n(thrust::device, v.begin(), v.size(), 137); + * + * // v[0] == 137, v[1] == 137, v[2] == 137, v[3] == 137 + * \endcode + * + * \see http://www.sgi.com/tech/stl/fill_n.html + * \see \c fill + * \see \c uninitialized_fill_n + */ +template + OutputIterator fill_n(const thrust::detail::execution_policy_base &exec, + OutputIterator first, + Size n, + const T &value); + + +/*! \p fill_n assigns the value \p value to every element in + * the range [first, first+n). That is, for every + * iterator \c i in [first, first+n), it performs + * the assignment *i = value. + * + * \param first The beginning of the sequence. + * \param n The size of the sequence. + * \param value The value to be copied. + * \return first + n + * + * \tparam OutputIterator is a model of Output Iterator. + * \tparam T is a model of Assignable, + * and \p T's \c value_type is convertible to a type in \p OutputIterator's set of \c value_type. + * + * The following code snippet demonstrates how to use \p fill to set a thrust::device_vector's + * elements to a given value. + * + * \code + * #include + * #include + * ... + * thrust::device_vector v(4); + * thrust::fill_n(v.begin(), v.size(), 137); + * + * // v[0] == 137, v[1] == 137, v[2] == 137, v[3] == 137 + * \endcode + * + * \see http://www.sgi.com/tech/stl/fill_n.html + * \see \c fill + * \see \c uninitialized_fill_n + */ +template + OutputIterator fill_n(OutputIterator first, + Size n, + const T &value); + + +/*! \} // end filling + * \} // transformations + */ + +} // end namespace thrust + +#include + diff --git a/compat/thrust/find.h b/compat/thrust/find.h new file mode 100644 index 0000000..fa01ded --- /dev/null +++ b/compat/thrust/find.h @@ -0,0 +1,382 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file find.h + * \brief Locating values in (unsorted) ranges + */ + +#pragma once + +#include +#include + +namespace thrust +{ + + +/*! \addtogroup algorithms + */ + +/*! \addtogroup searching + * \ingroup algorithms + * \{ + */ + + +/*! \p find returns the first iterator \c i in the range + * [first, last) such that *i == value + * or \c last if no such iterator exists. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first Beginning of the sequence to search. + * \param last End of the sequence to search. + * \param value The value to find. + * \return The first iterator \c i such that *i == value or \c last. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator is a model of Input Iterator + * and \p InputIterator's \c value_type is equality comparable to type \c T. + * \tparam T is a model of EqualityComparable. + * + * \code + * #include + * #include + * #include + * ... + * thrust::device_vector input(4); + * + * input[0] = 0; + * input[1] = 5; + * input[2] = 3; + * input[3] = 7; + * + * thrust::device_vector::iterator iter; + * + * iter = thrust::find(thrust::device, input.begin(), input.end(), 3); // returns input.first() + 2 + * iter = thrust::find(thrust::device, input.begin(), input.end(), 5); // returns input.first() + 1 + * iter = thrust::find(thrust::device, input.begin(), input.end(), 9); // returns input.end() + * \endcode + * + * \see find_if + * \see mismatch + */ +template +InputIterator find(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + const T& value); + + +/*! \p find returns the first iterator \c i in the range + * [first, last) such that *i == value + * or \c last if no such iterator exists. + * + * \param first Beginning of the sequence to search. + * \param last End of the sequence to search. + * \param value The value to find. + * \return The first iterator \c i such that *i == value or \c last. + * + * \tparam InputIterator is a model of Input Iterator + * and \p InputIterator's \c value_type is equality comparable to type \c T. + * \tparam T is a model of EqualityComparable. + * + * \code + * #include + * #include + * ... + * thrust::device_vector input(4); + * + * input[0] = 0; + * input[1] = 5; + * input[2] = 3; + * input[3] = 7; + * + * thrust::device_vector::iterator iter; + * + * iter = thrust::find(input.begin(), input.end(), 3); // returns input.first() + 2 + * iter = thrust::find(input.begin(), input.end(), 5); // returns input.first() + 1 + * iter = thrust::find(input.begin(), input.end(), 9); // returns input.end() + * \endcode + * + * \see find_if + * \see mismatch + */ +template +InputIterator find(InputIterator first, + InputIterator last, + const T& value); + + +/*! \p find_if returns the first iterator \c i in the range + * [first, last) such that pred(*i) is \c true + * or \c last if no such iterator exists. + * + * The algorithm's execution is parallelized as determined by \p exec. 
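+ *
+ * A minimal sketch of the call shape, assuming <thrust/find.h> and
+ * <thrust/execution_policy.h> and an illustrative is_odd functor (the full
+ * device_vector example appears further below):
+ *
+ * \code
+ * struct is_odd
+ * {
+ *   __host__ __device__
+ *   bool operator()(int x) const { return (x % 2) != 0; }
+ * };
+ * ...
+ * int data[4] = {0, 5, 3, 7};
+ * int *iter = thrust::find_if(thrust::host, data, data + 4, is_odd());
+ * // iter == data + 1, since 5 is the first odd value
+ * \endcode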
+ * + * \param exec The execution policy to use for parallelization. + * \param first Beginning of the sequence to search. + * \param last End of the sequence to search. + * \param pred A predicate used to test range elements. + * \return The first iterator \c i such that pred(*i) is \c true, or \c last. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator is a model of Input Iterator. + * \tparam Predicate is a model of Predicate. + * + * \code + * #include + * #include + * #include + * ... + * + * struct greater_than_four + * { + * __host__ __device__ + * bool operator()(int x) + * { + * return x > 4; + * } + * }; + * + * struct greater_than_ten + * { + * __host__ __device__ + * bool operator()(int x) + * { + * return x > 10; + * } + * }; + * + * ... + * thrust::device_vector input(4); + * + * input[0] = 0; + * input[1] = 5; + * input[2] = 3; + * input[3] = 7; + * + * thrust::device_vector::iterator iter; + * + * iter = thrust::find_if(thrust::device, input.begin(), input.end(), greater_than_four()); // returns input.first() + 1 + * + * iter = thrust::find_if(thrust::device, input.begin(), input.end(), greater_than_ten()); // returns input.end() + * \endcode + * + * \see find + * \see find_if_not + * \see mismatch + */ +template +InputIterator find_if(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + Predicate pred); + + +/*! \p find_if returns the first iterator \c i in the range + * [first, last) such that pred(*i) is \c true + * or \c last if no such iterator exists. + * + * \param first Beginning of the sequence to search. + * \param last End of the sequence to search. + * \param pred A predicate used to test range elements. + * \return The first iterator \c i such that pred(*i) is \c true, or \c last. + * + * \tparam InputIterator is a model of Input Iterator. + * \tparam Predicate is a model of Predicate. + * + * \code + * #include + * #include + * + * struct greater_than_four + * { + * __host__ __device__ + * bool operator()(int x) + * { + * return x > 4; + * } + * }; + * + * struct greater_than_ten + * { + * __host__ __device__ + * bool operator()(int x) + * { + * return x > 10; + * } + * }; + * + * ... + * thrust::device_vector input(4); + * + * input[0] = 0; + * input[1] = 5; + * input[2] = 3; + * input[3] = 7; + * + * thrust::device_vector::iterator iter; + * + * iter = thrust::find_if(input.begin(), input.end(), greater_than_four()); // returns input.first() + 1 + * + * iter = thrust::find_if(input.begin(), input.end(), greater_than_ten()); // returns input.end() + * \endcode + * + * \see find + * \see find_if_not + * \see mismatch + */ +template +InputIterator find_if(InputIterator first, + InputIterator last, + Predicate pred); + + +/*! \p find_if_not returns the first iterator \c i in the range + * [first, last) such that pred(*i) is \c false + * or \c last if no such iterator exists. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first Beginning of the sequence to search. + * \param last End of the sequence to search. + * \param pred A predicate used to test range elements. + * \return The first iterator \c i such that pred(*i) is \c false, or \c last. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator is a model of Input Iterator. + * \tparam Predicate is a model of Predicate. 
+ * + * \code + * #include + * #include + * #include + * ... + * + * struct greater_than_four + * { + * __host__ __device__ + * bool operator()(int x) + * { + * return x > 4; + * } + * }; + * + * struct greater_than_ten + * { + * __host__ __device__ + * bool operator()(int x) + * { + * return x > 10; + * } + * }; + * + * ... + * thrust::device_vector input(4); + * + * input[0] = 0; + * input[1] = 5; + * input[2] = 3; + * input[3] = 7; + * + * thrust::device_vector::iterator iter; + * + * iter = thrust::find_if_not(thrust::device, input.begin(), input.end(), greater_than_four()); // returns input.first() + * + * iter = thrust::find_if_not(thrust::device, input.begin(), input.end(), greater_than_ten()); // returns input.first() + * \endcode + * + * \see find + * \see find_if + * \see mismatch + */ +template +InputIterator find_if_not(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + Predicate pred); + + +/*! \p find_if_not returns the first iterator \c i in the range + * [first, last) such that pred(*i) is \c false + * or \c last if no such iterator exists. + * + * \param first Beginning of the sequence to search. + * \param last End of the sequence to search. + * \param pred A predicate used to test range elements. + * \return The first iterator \c i such that pred(*i) is \c false, or \c last. + * + * \tparam InputIterator is a model of Input Iterator. + * \tparam Predicate is a model of Predicate. + * + * \code + * #include + * #include + * + * struct greater_than_four + * { + * __host__ __device__ + * bool operator()(int x) + * { + * return x > 4; + * } + * }; + * + * struct greater_than_ten + * { + * __host__ __device__ + * bool operator()(int x) + * { + * return x > 10; + * } + * }; + * + * ... + * thrust::device_vector input(4); + * + * input[0] = 0; + * input[1] = 5; + * input[2] = 3; + * input[3] = 7; + * + * thrust::device_vector::iterator iter; + * + * iter = thrust::find_if_not(input.begin(), input.end(), greater_than_four()); // returns input.first() + * + * iter = thrust::find_if_not(input.begin(), input.end(), greater_than_ten()); // returns input.first() + * \endcode + * + * \see find + * \see find_if + * \see mismatch + */ +template +InputIterator find_if_not(InputIterator first, + InputIterator last, + Predicate pred); + +/*! \} // end searching + */ + + +} // end namespace thrust + +#include + diff --git a/compat/thrust/for_each.h b/compat/thrust/for_each.h new file mode 100644 index 0000000..efab9d8 --- /dev/null +++ b/compat/thrust/for_each.h @@ -0,0 +1,278 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file for_each.h + * \brief Applies a function to each element in a range + */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ + + +/*! \addtogroup modifying + * \ingroup transformations + * \{ + */ + + +/*! 
\p for_each applies the function object \p f to each element
+ * in the range [first, last); \p f's return value, if any,
+ * is ignored. Unlike the C++ Standard Template Library function
+ * std::for_each, this version offers no guarantee on
+ * order of execution. For this reason, this version of \p for_each
+ * does not return a copy of the function object.
+ *
+ * The algorithm's execution is parallelized as determined by \p exec.
+ *
+ * \param exec The execution policy to use for parallelization.
+ * \param first The beginning of the sequence.
+ * \param last The end of the sequence.
+ * \param f The function object to apply to the range [first, last).
+ * \return last
+ *
+ * \tparam DerivedPolicy The name of the derived execution policy.
+ * \tparam InputIterator is a model of Input Iterator,
+ * and \p InputIterator's \c value_type is convertible to \p UnaryFunction's \c argument_type.
+ * \tparam UnaryFunction is a model of Unary Function,
+ * and \p UnaryFunction does not apply any non-constant operation through its argument.
+ *
+ * The following code snippet demonstrates how to use \p for_each to print the elements
+ * of a \p device_vector using the \p thrust::device parallelization policy:
+ *
+ * \code
+ * #include <thrust/for_each.h>
+ * #include <thrust/device_vector.h>
+ * #include <thrust/execution_policy.h>
+ * #include <cstdio>
+ * ...
+ *
+ * struct printf_functor
+ * {
+ *   __host__ __device__
+ *   void operator()(int x)
+ *   {
+ *     // note that using printf in a __device__ function requires
+ *     // code compiled for a GPU with compute capability 2.0 or
+ *     // higher (nvcc --arch=sm_20)
+ *     printf("%d\n", x);
+ *   }
+ * };
+ * ...
+ * thrust::device_vector<int> d_vec(3);
+ * d_vec[0] = 0; d_vec[1] = 1; d_vec[2] = 2;
+ *
+ * thrust::for_each(thrust::device, d_vec.begin(), d_vec.end(), printf_functor());
+ *
+ * // 0 1 2 is printed to standard output in some unspecified order
+ * \endcode
+ *
+ * \see for_each_n
+ * \see http://www.sgi.com/tech/stl/for_each.html
+ */
+template<typename DerivedPolicy, typename InputIterator, typename UnaryFunction>
+InputIterator for_each(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+ InputIterator first,
+ InputIterator last,
+ UnaryFunction f);
+
+
+/*! \p for_each_n applies the function object \p f to each element
+ * in the range [first, first + n); \p f's return value, if any,
+ * is ignored. Unlike the C++ Standard Template Library function
+ * std::for_each, this version offers no guarantee on
+ * order of execution.
+ *
+ * The algorithm's execution is parallelized as determined by \p exec.
+ *
+ * \param exec The execution policy to use for parallelization.
+ * \param first The beginning of the sequence.
+ * \param n The size of the input sequence.
+ * \param f The function object to apply to the range [first, first + n).
+ * \return first + n
+ *
+ * \tparam DerivedPolicy The name of the derived execution policy.
+ * \tparam InputIterator is a model of Input Iterator,
+ * and \p InputIterator's \c value_type is convertible to \p UnaryFunction's \c argument_type.
+ * \tparam Size is an integral type.
+ * \tparam UnaryFunction is a model of Unary Function,
+ * and \p UnaryFunction does not apply any non-constant operation through its argument.
+ *
+ * The following code snippet demonstrates how to use \p for_each_n to print the elements
+ * of a \p device_vector using the \p thrust::device parallelization policy.
+ * + * \code + * #include + * #include + * #include + * #include + * + * struct printf_functor + * { + * __host__ __device__ + * void operator()(int x) + * { + * // note that using printf in a __device__ function requires + * // code compiled for a GPU with compute capability 2.0 or + * // higher (nvcc --arch=sm_20) + * printf("%d\n"); + * } + * }; + * ... + * thrust::device_vector d_vec(3); + * d_vec[0] = 0; d_vec[1] = 1; d_vec[2] = 2; + * + * thrust::for_each_n(thrust::device, d_vec.begin(), d_vec.size(), printf_functor()); + * + * // 0 1 2 is printed to standard output in some unspecified order + * \endcode + * + * \see for_each + * \see http://www.sgi.com/tech/stl/for_each.html + */ +template +InputIterator for_each_n(const thrust::detail::execution_policy_base &exec, + InputIterator first, + Size n, + UnaryFunction f); + +/*! \p for_each applies the function object \p f to each element + * in the range [first, last); \p f's return value, if any, + * is ignored. Unlike the C++ Standard Template Library function + * std::for_each, this version offers no guarantee on + * order of execution. For this reason, this version of \p for_each + * does not return a copy of the function object. + * + * \param first The beginning of the sequence. + * \param last The end of the sequence. + * \param f The function object to apply to the range [first, last). + * \return last + * + * \tparam InputIterator is a model of Input Iterator, + * and \p InputIterator's \c value_type is convertible to \p UnaryFunction's \c argument_type. + * \tparam UnaryFunction is a model of Unary Function, + * and \p UnaryFunction does not apply any non-constant operation through its argument. + * + * The following code snippet demonstrates how to use \p for_each to print the elements + * of a \p device_vector. + * + * \code + * #include + * #include + * #include + * + * struct printf_functor + * { + * __host__ __device__ + * void operator()(int x) + * { + * // note that using printf in a __device__ function requires + * // code compiled for a GPU with compute capability 2.0 or + * // higher (nvcc --arch=sm_20) + * printf("%d\n"); + * } + * }; + * ... + * thrust::device_vector d_vec(3); + * d_vec[0] = 0; d_vec[1] = 1; d_vec[2] = 2; + * + * thrust::for_each(d_vec.begin(), d_vec.end(), printf_functor()); + * + * // 0 1 2 is printed to standard output in some unspecified order + * \endcode + * + * \see for_each_n + * \see http://www.sgi.com/tech/stl/for_each.html + */ +template +InputIterator for_each(InputIterator first, + InputIterator last, + UnaryFunction f); + + +/*! \p for_each_n applies the function object \p f to each element + * in the range [first, first + n); \p f's return value, if any, + * is ignored. Unlike the C++ Standard Template Library function + * std::for_each, this version offers no guarantee on + * order of execution. + * + * \param first The beginning of the sequence. + * \param n The size of the input sequence. + * \param f The function object to apply to the range [first, first + n). + * \return first + n + * + * \tparam InputIterator is a model of Input Iterator, + * and \p InputIterator's \c value_type is convertible to \p UnaryFunction's \c argument_type. + * \tparam Size is an integral type. + * \tparam UnaryFunction is a model of Unary Function, + * and \p UnaryFunction does not apply any non-constant operation through its argument. + * + * The following code snippet demonstrates how to use \p for_each_n to print the elements + * of a \p device_vector. 
+ * + * \code + * #include + * #include + * #include + * + * struct printf_functor + * { + * __host__ __device__ + * void operator()(int x) + * { + * // note that using printf in a __device__ function requires + * // code compiled for a GPU with compute capability 2.0 or + * // higher (nvcc --arch=sm_20) + * printf("%d\n"); + * } + * }; + * ... + * thrust::device_vector d_vec(3); + * d_vec[0] = 0; d_vec[1] = 1; d_vec[2] = 2; + * + * thrust::for_each_n(d_vec.begin(), d_vec.size(), printf_functor()); + * + * // 0 1 2 is printed to standard output in some unspecified order + * \endcode + * + * \see for_each + * \see http://www.sgi.com/tech/stl/for_each.html + */ +template +InputIterator for_each_n(InputIterator first, + Size n, + UnaryFunction f); + +/*! \} // end modifying + */ + +} // end namespace thrust + +#include + diff --git a/compat/thrust/functional.h b/compat/thrust/functional.h new file mode 100644 index 0000000..b3d47f9 --- /dev/null +++ b/compat/thrust/functional.h @@ -0,0 +1,1079 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file functional.h + * \brief Function objects and tools for manipulating them + */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ + +/*! \addtogroup function_objects Function Objects + */ + +template struct unary_traits; + +template struct binary_traits; + +/*! \addtogroup function_object_adaptors Function Object Adaptors + * \ingroup function_objects + * \{ + */ + +/*! \p unary_function is an empty base class: it contains no member functions + * or member variables, but only type information. The only reason it exists + * is to make it more convenient to define types that are models of the + * concept Adaptable Unary Function. Specifically, any model of Adaptable + * Unary Function must define nested \c typedefs. Those \c typedefs are + * provided by the base class \p unary_function. + * + * The following code snippet demonstrates how to construct an + * Adaptable Unary Function using \p unary_function. + * + * \code + * struct sine : public thrust::unary_function + * { + * __host__ __device__ + * float operator()(float x) { return sinf(x); } + * }; + * \endcode + * + * \note unary_function is currently redundant with the C++ STL type + * \c std::unary_function. We reserve it here for potential additional + * functionality at a later date. + * + * \see http://www.sgi.com/tech/stl/unary_function.html + * \see binary_function + */ +template + struct unary_function + : public std::unary_function +{ +}; // end unary_function + +/*! \p binary_function is an empty base class: it contains no member functions + * or member variables, but only type information. The only reason it exists + * is to make it more convenient to define types that are models of the + * concept Adaptable Binary Function. Specifically, any model of Adaptable + * Binary Function must define nested \c typedefs. Those \c typedefs are + * provided by the base class \p binary_function. 
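+ *
+ * A functor derived from \p binary_function is typically consumed by an
+ * algorithm such as \p thrust::transform. A minimal sketch (the axpy functor,
+ * header names, and vector sizes are illustrative assumptions):
+ *
+ * \code
+ * #include <thrust/functional.h>
+ * #include <thrust/transform.h>
+ * #include <thrust/device_vector.h>
+ *
+ * struct axpy : public thrust::binary_function<float,float,float>
+ * {
+ *   float a;
+ *   axpy(float _a) : a(_a) {}
+ *   __host__ __device__
+ *   float operator()(float x, float y) const { return a * x + y; }
+ * };
+ * ...
+ * thrust::device_vector<float> X(100), Y(100), Z(100);
+ * // ... fill X and Y ...
+ * thrust::transform(X.begin(), X.end(), Y.begin(), Z.begin(), axpy(2.0f));
+ * // Z[i] is now 2 * X[i] + Y[i]
+ * \endcode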
+ * + * The following code snippet demonstrates how to construct an + * Adaptable Binary Function using \p binary_function. + * + * \code + * struct exponentiate : public thrust::binary_function + * { + * __host__ __device__ + * float operator()(float x, float y) { return powf(x,y); } + * }; + * \endcode + * + * \note binary_function is currently redundant with the C++ STL type + * \c std::binary_function. We reserve it here for potential additional + * functionality at a later date. + * + * \see http://www.sgi.com/tech/stl/binary_function.html + * \see unary_function + */ +template + struct binary_function + : public std::binary_function +{ +}; // end binary_function + +/*! \} + */ + + +/*! \addtogroup predefined_function_objects Predefined Function Objects + * \ingroup function_objects + */ + +/*! \addtogroup arithmetic_operations Arithmetic Operations + * \ingroup predefined_function_objects + * \{ + */ + +/*! \p plus is a function object. Specifically, it is an Adaptable Binary Function. + * If \c f is an object of class plus, and \c x and \c y are objects + * of class \c T, then f(x,y) returns x+y. + * + * \tparam T is a model of Assignable, + * and if \c x and \c y are objects of type \p T, then x+y must be defined and must have a return type that is convertible to \c T. + * + * The following code snippet demonstrates how to use plus to sum two + * device_vectors of \c floats. + * + * \code + * #include + * #include + * #include + * #include + * #include + * ... + * const int N = 1000; + * thrust::device_vector V1(N); + * thrust::device_vector V2(N); + * thrust::device_vector V3(N); + * + * thrust::sequence(V1.begin(), V1.end(), 1); + * thrust::fill(V2.begin(), V2.end(), 75); + * + * thrust::transform(V1.begin(), V1.end(), V2.begin(), V3.begin(), + * thrust::plus()); + * // V3 is now {76, 77, 78, ..., 1075} + * \endcode + * + * \see http://www.sgi.com/tech/stl/plus.html + * \see binary_function + */ +template + struct plus : public binary_function +{ + /*! Function call operator. The return value is lhs + rhs. + */ + __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs + rhs;} +}; // end plus + +/*! \p minus is a function object. Specifically, it is an Adaptable Binary Function. + * If \c f is an object of class minus, and \c x and \c y are objects + * of class \c T, then f(x,y) returns x-y. + * + * \tparam T is a model of Assignable, + * and if \c x and \c y are objects of type \p T, then x-y must be defined and must have a return type that is convertible to \c T. + * + * The following code snippet demonstrates how to use minus to subtract + * a device_vector of \c floats from another. + * + * \code + * #include + * #include + * #include + * #include + * #include + * ... + * const int N = 1000; + * thrust::device_vector V1(N); + * thrust::device_vector V2(N); + * thrust::device_vector V3(N); + * + * thrust::sequence(V1.begin(), V1.end(), 1); + * thrust::fill(V2.begin(), V2.end(), 75); + * + * thrust::transform(V1.begin(), V1.end(), V2.begin(), V3.begin(), + * thrust::minus()); + * // V3 is now {-74, -75, -76, ..., -925} + * \endcode + * + * \see http://www.sgi.com/tech/stl/minus.html + * \see binary_function + */ +template + struct minus : public binary_function +{ + /*! Function call operator. The return value is lhs - rhs. + */ + __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs - rhs;} +}; // end minus + +/*! \p multiplies is a function object. Specifically, it is an Adaptable Binary Function. 
+ * If \c f is an object of class minus, and \c x and \c y are objects + * of class \c T, then f(x,y) returns x*y. + * + * \tparam T is a model of Assignable, + * and if \c x and \c y are objects of type \p T, then x*y must be defined and must have a return type that is convertible to \c T. + * + * The following code snippet demonstrates how to use multiplies to multiply + * two device_vectors of \c floats. + * + * \code + * #include + * #include + * #include + * #include + * #include + * ... + * const int N = 1000; + * thrust::device_vector V1(N); + * thrust::device_vector V2(N); + * thrust::device_vector V3(N); + * + * thrust::sequence(V1.begin(), V1.end(), 1); + * thrust::fill(V2.begin(), V2.end(), 75); + * + * thrust::transform(V1.begin(), V1.end(), V2.begin(), V3.begin(), + * thrust::multiplies()); + * // V3 is now {75, 150, 225, ..., 75000} + * \endcode + * + * \see http://www.sgi.com/tech/stl/multiplies.html + * \see binary_function + */ +template + struct multiplies : public binary_function +{ + /*! Function call operator. The return value is lhs * rhs. + */ + __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs * rhs;} +}; // end multiplies + +/*! \p divides is a function object. Specifically, it is an Adaptable Binary Function. + * If \c f is an object of class divides, and \c x and \c y are objects + * of class \c T, then f(x,y) returns x/y. + * + * \tparam T is a model of Assignable, + * and if \c x and \c y are objects of type \p T, then x/y must be defined and must have a return type that is convertible to \c T. + * + * The following code snippet demonstrates how to use divides to divide + * one device_vectors of \c floats by another. + * + * \code + * #include + * #include + * #include + * #include + * #include + * ... + * const int N = 1000; + * thrust::device_vector V1(N); + * thrust::device_vector V2(N); + * thrust::device_vector V3(N); + * + * thrust::sequence(V1.begin(), V1.end(), 1); + * thrust::fill(V2.begin(), V2.end(), 75); + * + * thrust::transform(V1.begin(), V1.end(), V2.begin(), V3.begin(), + * thrust::divides()); + * // V3 is now {1/75, 2/75, 3/75, ..., 1000/75} + * \endcode + * + * \see http://www.sgi.com/tech/stl/divides.html + * \see binary_function + */ +template + struct divides : public binary_function +{ + /*! Function call operator. The return value is lhs / rhs. + */ + __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs / rhs;} +}; // end divides + +/*! \p modulus is a function object. Specifically, it is an Adaptable Binary Function. + * If \c f is an object of class divides, and \c x and \c y are objects + * of class \c T, then f(x,y) returns x%y. + * + * \tparam T is a model of Assignable, + * and if \c x and \c y are objects of type \p T, then x%y must be defined and must have a return type that is convertible to \c T. + * + * The following code snippet demonstrates how to use modulus to take + * the modulus of one device_vectors of \c floats by another. + * + * \code + * #include + * #include + * #include + * #include + * #include + * ... 
+ * const int N = 1000; + * thrust::device_vector V1(N); + * thrust::device_vector V2(N); + * thrust::device_vector V3(N); + * + * thrust::sequence(V1.begin(), V1.end(), 1); + * thrust::fill(V2.begin(), V2.end(), 75); + * + * thrust::transform(V1.begin(), V1.end(), V2.begin(), V3.begin(), + * thrust::modulus()); + * // V3 is now {1%75, 2%75, 3%75, ..., 1000%75} + * \endcode + * + * \see http://www.sgi.com/tech/stl/modulus.html + * \see binary_function + */ +template + struct modulus : public binary_function +{ + /*! Function call operator. The return value is lhs % rhs. + */ + __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs % rhs;} +}; // end modulus + +/*! \p negate is a function object. Specifically, it is an Adaptable Unary Function. + * If \c f is an object of class negate, and \c x is an object + * of class \c T, then f(x) returns -x. + * + * \tparam T is a model of Assignable, + * and if \c x is an object of type \p T, then -x must be defined and must have a return type that is convertible to \c T. + * + * The following code snippet demonstrates how to use negate to negate + * the element of a device_vector of \c floats. + * + * \code + * #include + * #include + * #include + * #include + * ... + * const int N = 1000; + * thrust::device_vector V1(N); + * thrust::device_vector V2(N); + * + * thrust::sequence(V1.begin(), V1.end(), 1); + * + * thrust::transform(V1.begin(), V1.end(), V2.begin(), + * thrust::negate()); + * // V2 is now {-1, -2, -3, ..., -1000} + * \endcode + * + * \see http://www.sgi.com/tech/stl/negate.html + * \see unary_function + */ +template + struct negate : public unary_function +{ + /*! Function call operator. The return value is -x. + */ + __host__ __device__ T operator()(const T &x) const {return -x;} +}; // end negate + +/*! \} + */ + +/*! \addtogroup comparison_operations Comparison Operations + * \ingroup predefined_function_objects + * \{ + */ + +/*! \p equal_to is a function object. Specifically, it is an Adaptable Binary + * Predicate, which means it is a function object that tests the truth or falsehood + * of some condition. If \c f is an object of class equal_to and \c x + * and \c y are objects of class \c T, then f(x,y) returns \c true if + * x == y and \c false otherwise. + * + * \tparam T is a model of Equality Comparable. + * + * \see http://www.sgi.com/tech/stl/equal_to.html + * \see binary_function + */ +template + struct equal_to : public binary_function +{ + /*! Function call operator. The return value is lhs == rhs. + */ + __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {return lhs == rhs;} +}; // end equal_to + +/*! \p not_equal_to is a function object. Specifically, it is an Adaptable Binary + * Predicate, which means it is a function object that tests the truth or falsehood + * of some condition. If \c f is an object of class not_equal_to and \c x + * and \c y are objects of class \c T, then f(x,y) returns \c true if + * x != y and \c false otherwise. + * + * \tparam T is a model of Equality Comparable. + * + * \see http://www.sgi.com/tech/stl/not_equal_to.html + * \see binary_function + */ +template + struct not_equal_to : public binary_function +{ + /*! Function call operator. The return value is lhs != rhs. + */ + __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {return lhs != rhs;} +}; // end not_equal_to + +/*! \p greater is a function object. 
Specifically, it is an Adaptable Binary + * Predicate, which means it is a function object that tests the truth or falsehood + * of some condition. If \c f is an object of class greater and \c x + * and \c y are objects of class \c T, then f(x,y) returns \c true if + * x > y and \c false otherwise. + * + * \tparam T is a model of LessThan Comparable. + * + * \see http://www.sgi.com/tech/stl/greater.html + * \see binary_function + */ +template + struct greater : public binary_function +{ + /*! Function call operator. The return value is lhs > rhs. + */ + __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {return lhs > rhs;} +}; // end greater + +/*! \p less is a function object. Specifically, it is an Adaptable Binary + * Predicate, which means it is a function object that tests the truth or falsehood + * of some condition. If \c f is an object of class less and \c x + * and \c y are objects of class \c T, then f(x,y) returns \c true if + * x < y and \c false otherwise. + * + * \tparam T is a model of LessThan Comparable. + * + * \see http://www.sgi.com/tech/stl/less.html + * \see binary_function + */ +template + struct less : public binary_function +{ + /*! Function call operator. The return value is lhs < rhs. + */ + __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {return lhs < rhs;} +}; // end less + +/*! \p greater_equal is a function object. Specifically, it is an Adaptable Binary + * Predicate, which means it is a function object that tests the truth or falsehood + * of some condition. If \c f is an object of class greater_equal and \c x + * and \c y are objects of class \c T, then f(x,y) returns \c true if + * x >= y and \c false otherwise. + * + * \tparam T is a model of LessThan Comparable. + * + * \see http://www.sgi.com/tech/stl/greater_equal.html + * \see binary_function + */ +template + struct greater_equal : public binary_function +{ + /*! Function call operator. The return value is lhs >= rhs. + */ + __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {return lhs >= rhs;} +}; // end greater_equal + +/*! \p less_equal is a function object. Specifically, it is an Adaptable Binary + * Predicate, which means it is a function object that tests the truth or falsehood + * of some condition. If \c f is an object of class less_equal and \c x + * and \c y are objects of class \c T, then f(x,y) returns \c true if + * x <= y and \c false otherwise. + * + * \tparam T is a model of LessThan Comparable. + * + * \see http://www.sgi.com/tech/stl/less_equal.html + * \see binary_function + */ +template + struct less_equal : public binary_function +{ + /*! Function call operator. The return value is lhs <= rhs. + */ + __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {return lhs <= rhs;} +}; // end less_equal + +/*! \} + */ + + +/*! \addtogroup logical_operations Logical Operations + * \ingroup predefined_function_objects + * \{ + */ + +/*! \p logical_and is a function object. Specifically, it is an Adaptable Binary Predicate, + * which means it is a function object that tests the truth or falsehood of some condition. + * If \c f is an object of class logical_and and \c x and \c y are objects of + * class \c T (where \c T is convertible to \c bool) then f(x,y) returns \c true + * if and only if both \c x and \c y are \c true. + * + * \tparam T must be convertible to \c bool. 
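+ *
+ * A minimal sketch of combining two boolean sequences element-wise with
+ * \p logical_and via \p thrust::transform (header names and vector contents
+ * are assumptions for illustration):
+ *
+ * \code
+ * #include <thrust/functional.h>
+ * #include <thrust/transform.h>
+ * #include <thrust/device_vector.h>
+ * ...
+ * thrust::device_vector<bool> A(3), B(3), C(3);
+ * // ... fill A and B ...
+ * thrust::transform(A.begin(), A.end(), B.begin(), C.begin(),
+ *                   thrust::logical_and<bool>());
+ * // C[i] is now A[i] && B[i]
+ * \endcode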
+ * + * \see http://www.sgi.com/tech/stl/logical_and.html + * \see binary_function + */ +template + struct logical_and : public binary_function +{ + /*! Function call operator. The return value is lhs && rhs. + */ + __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {return lhs && rhs;} +}; // end logical_and + +/*! \p logical_or is a function object. Specifically, it is an Adaptable Binary Predicate, + * which means it is a function object that tests the truth or falsehood of some condition. + * If \c f is an object of class logical_or and \c x and \c y are objects of + * class \c T (where \c T is convertible to \c bool) then f(x,y) returns \c true + * if and only if either \c x or \c y are \c true. + * + * \tparam T must be convertible to \c bool. + * + * \see http://www.sgi.com/tech/stl/logical_or.html + * \see binary_function + */ +template + struct logical_or : public binary_function +{ + /*! Function call operator. The return value is lhs || rhs. + */ + __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {return lhs || rhs;} +}; // end logical_or + +/*! \p logical_not is a function object. Specifically, it is an Adaptable Predicate, + * which means it is a function object that tests the truth or falsehood of some condition. + * If \c f is an object of class logical_not and \c x is an object of + * class \c T (where \c T is convertible to \c bool) then f(x) returns \c true + * if and only if \c x is \c false. + * + * \tparam T must be convertible to \c bool. + * + * The following code snippet demonstrates how to use \p logical_not to transform + * a device_vector of \c bools into its logical complement. + * + * \code + * #include + * #include + * #include + * ... + * thrust::device_vector V; + * ... + * thrust::transform(V.begin(), V.end(), V.begin(), thrust::logical_not()); + * // The elements of V are now the logical complement of what they were prior + * \endcode + * + * \see http://www.sgi.com/tech/stl/logical_not.html + * \see unary_function + */ +template + struct logical_not : public unary_function +{ + /*! Function call operator. The return value is !x. + */ + __host__ __device__ bool operator()(const T &x) const {return !x;} +}; // end logical_not + +/*! \} + */ + +/*! \addtogroup bitwise_operations Bitwise Operations + * \ingroup predefined_function_objects + * \{ + */ + +/*! \p bit_and is a function object. Specifically, it is an Adaptable Binary Function. + * If \c f is an object of class bit_and, and \c x and \c y are objects + * of class \c T, then f(x,y) returns x&y. + * + * \tparam T is a model of Assignable, + * and if \c x and \c y are objects of type \p T, then x&y must be defined and must have a return type that is convertible to \c T. + * + * The following code snippet demonstrates how to use bit_and to take + * the bitwise AND of one device_vector of \c ints by another. + * + * \code + * #include + * #include + * #include + * #include + * #include + * ... + * const int N = 1000; + * thrust::device_vector V1(N); + * thrust::device_vector V2(N); + * thrust::device_vector V3(N); + * + * thrust::sequence(V1.begin(), V1.end(), 1); + * thrust::fill(V2.begin(), V2.end(), 13); + * + * thrust::transform(V1.begin(), V1.end(), V2.begin(), V3.begin(), + * thrust::bit_and()); + * // V3 is now {1&13, 2&13, 3&13, ..., 1000%13} + * \endcode + * + * \see binary_function + */ +template + struct bit_and : public binary_function +{ + /*! Function call operator. The return value is lhs & rhs. 
+ */ + __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs & rhs;} +}; // end bit_and + +/*! \p bit_or is a function object. Specifically, it is an Adaptable Binary Function. + * If \c f is an object of class bit_and, and \c x and \c y are objects + * of class \c T, then f(x,y) returns x|y. + * + * \tparam T is a model of Assignable, + * and if \c x and \c y are objects of type \p T, then x|y must be defined and must have a return type that is convertible to \c T. + * + * The following code snippet demonstrates how to use bit_or to take + * the bitwise OR of one device_vector of \c ints by another. + * + * \code + * #include + * #include + * #include + * #include + * #include + * ... + * const int N = 1000; + * thrust::device_vector V1(N); + * thrust::device_vector V2(N); + * thrust::device_vector V3(N); + * + * thrust::sequence(V1.begin(), V1.end(), 1); + * thrust::fill(V2.begin(), V2.end(), 13); + * + * thrust::transform(V1.begin(), V1.end(), V2.begin(), V3.begin(), + * thrust::bit_or()); + * // V3 is now {1|13, 2|13, 3|13, ..., 1000|13} + * \endcode + * + * \see binary_function + */ +template + struct bit_or : public binary_function +{ + /*! Function call operator. The return value is lhs | rhs. + */ + __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs | rhs;} +}; // end bit_or + +/*! \p bit_xor is a function object. Specifically, it is an Adaptable Binary Function. + * If \c f is an object of class bit_and, and \c x and \c y are objects + * of class \c T, then f(x,y) returns x^y. + * + * \tparam T is a model of Assignable, + * and if \c x and \c y are objects of type \p T, then x^y must be defined and must have a return type that is convertible to \c T. + * + * The following code snippet demonstrates how to use bit_xor to take + * the bitwise XOR of one device_vector of \c ints by another. + * + * \code + * #include + * #include + * #include + * #include + * #include + * ... + * const int N = 1000; + * thrust::device_vector V1(N); + * thrust::device_vector V2(N); + * thrust::device_vector V3(N); + * + * thrust::sequence(V1.begin(), V1.end(), 1); + * thrust::fill(V2.begin(), V2.end(), 13); + * + * thrust::transform(V1.begin(), V1.end(), V2.begin(), V3.begin(), + * thrust::bit_xor()); + * // V3 is now {1^13, 2^13, 3^13, ..., 1000^13} + * \endcode + * + * \see binary_function + */ +template + struct bit_xor : public binary_function +{ + /*! Function call operator. The return value is lhs ^ rhs. + */ + __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs ^ rhs;} +}; // end bit_xor + +/*! \} + */ + +/*! \addtogroup generalized_identity_operations Generalized Identity Operations + * \ingroup predefined_function_objects + * \{ + */ + +/*! \p identity is a Unary Function that represents the identity function: it takes + * a single argument \c x, and returns \c x. + * + * \tparam T No requirements on \p T. + * + * The following code snippet demonstrates that \p identity returns its + * argument. + * + * \code + * #include + * #include + * ... + * int x = 137; + * thrust::identity id; + * assert(x == id(x)); + * \endcode + * + * \see http://www.sgi.com/tech/stl/identity.html + * \see unary_function + */ +template + struct identity : public unary_function +{ + /*! Function call operator. The return value is x. + */ + __host__ __device__ const T &operator()(const T &x) const {return x;} +}; // end identity + +/*! \p maximum is a function object that takes two arguments and returns the greater + * of the two. 
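+ *
+ * One common use is as the reduction operator in a max-reduction; a minimal
+ * sketch, assuming <thrust/reduce.h>, <thrust/device_vector.h>, and <climits>:
+ *
+ * \code
+ * #include <thrust/functional.h>
+ * #include <thrust/reduce.h>
+ * #include <thrust/device_vector.h>
+ * #include <climits>
+ * ...
+ * thrust::device_vector<int> d_vec(100);
+ * // ... fill d_vec ...
+ * int max_val = thrust::reduce(d_vec.begin(), d_vec.end(),
+ *                              INT_MIN,                 // initial value
+ *                              thrust::maximum<int>()); // binary reduction op
+ * \endcode
+ *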
Specifically, it is an Adaptable Binary Function. If \c f is an + * object of class maximum and \c x and \c y are objects of class \c T + * f(x,y) returns \c x if x > y and \c y, otherwise. + * + * \tparam T is a model of LessThan Comparable. + * + * The following code snippet demonstrates that \p maximum returns its + * greater argument. + * + * \code + * #include + * #include + * ... + * int x = 137; + * int y = -137; + * thrust::maximum mx; + * assert(x == mx(x,y)); + * \endcode + * + * \see minimum + * \see min + * \see binary_function + */ +template + struct maximum : public binary_function +{ + /*! Function call operator. The return value is rhs < lhs ? lhs : rhs. + */ + __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs < rhs ? rhs : lhs;} +}; // end maximum + +/*! \p minimum is a function object that takes two arguments and returns the lesser + * of the two. Specifically, it is an Adaptable Binary Function. If \c f is an + * object of class minimum and \c x and \c y are objects of class \c T + * f(x,y) returns \c x if x < y and \c y, otherwise. + * + * \tparam T is a model of LessThan Comparable. + * + * The following code snippet demonstrates that \p minimum returns its + * lesser argument. + * + * \code + * #include + * #include + * ... + * int x = 137; + * int y = -137; + * thrust::minimum mn; + * assert(y == mn(x,y)); + * \endcode + * + * \see maximum + * \see max + * \see binary_function + */ +template + struct minimum : public binary_function +{ + /*! Function call operator. The return value is lhs < rhs ? lhs : rhs. + */ + __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs < rhs ? lhs : rhs;} +}; // end minimum + +/*! \p project1st is a function object that takes two arguments and returns + * its first argument; the second argument is unused. It is essentially a + * generalization of identity to the case of a Binary Function. + * + * \code + * #include + * #include + * ... + * int x = 137; + * int y = -137; + * thrust::project1st pj1; + * assert(x == pj1(x,y)); + * \endcode + * + * \see identity + * \see project2nd + * \see binary_function + */ +template + struct project1st : public binary_function +{ + /*! Function call operator. The return value is lhs. + */ + __host__ __device__ const T1 &operator()(const T1 &lhs, const T2 &rhs) const {return lhs;} +}; // end project1st + +/*! \p project2nd is a function object that takes two arguments and returns + * its second argument; the first argument is unused. It is essentially a + * generalization of identity to the case of a Binary Function. + * + * \code + * #include + * #include + * ... + * int x = 137; + * int y = -137; + * thrust::project2nd pj2; + * assert(y == pj2(x,y)); + * \endcode + * + * \see identity + * \see project1st + * \see binary_function + */ +template + struct project2nd : public binary_function +{ + /*! Function call operator. The return value is rhs. + */ + __host__ __device__ const T2 &operator()(const T1 &lhs, const T2 &rhs) const {return rhs;} +}; // end project2nd + +/*! \} + */ + + +// odds and ends + +/*! \addtogroup function_object_adaptors + * \{ + */ + +/*! \p unary_negate is a function object adaptor: it is an Adaptable Predicate + * that represents the logical negation of some other Adaptable Predicate. + * That is: if \c f is an object of class unary_negate, + * then there exists an object \c pred of class \c AdaptablePredicate such + * that f(x) always returns the same value as !pred(x). 
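+ *
+ * An illustrative sketch (assuming <thrust/count.h> and <thrust/device_vector.h> are
+ * available, and using a hypothetical user-defined predicate \c is_odd) of counting
+ * the elements that do not satisfy a predicate:
+ *
+ * \code
+ * #include <thrust/count.h>
+ * #include <thrust/device_vector.h>
+ * #include <thrust/functional.h>
+ *
+ * struct is_odd : public thrust::unary_function<int,bool>
+ * {
+ *   __host__ __device__
+ *   bool operator()(int x) const { return (x % 2) != 0; }
+ * };
+ * ...
+ * thrust::device_vector<int> v(4);
+ * v[0] = 1; v[1] = 2; v[2] = 3; v[3] = 4;
+ *
+ * // negating is_odd counts the even elements
+ * thrust::unary_negate<is_odd> not_odd((is_odd()));
+ * int n_even = thrust::count_if(v.begin(), v.end(), not_odd);
+ * // n_even == 2
+ * \endcode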
+ * There is rarely any reason to construct a unary_negate directly;
+ * it is almost always easier to use the helper function not1.
+ *
+ * \see http://www.sgi.com/tech/stl/unary_negate.html
+ * \see not1
+ */
+template<typename Predicate>
+struct unary_negate
+    : public thrust::unary_function<typename Predicate::argument_type, bool>
+{
+  /*! Constructor takes a \p Predicate object to negate.
+   *  \param p The \p Predicate object to negate.
+   */
+  __host__ __device__
+  explicit unary_negate(Predicate p) : pred(p){}
+
+  /*! Function call operator. The return value is !pred(x).
+   */
+  __host__ __device__
+  bool operator()(const typename Predicate::argument_type& x) { return !pred(x); }
+
+  /*! \cond */
+  Predicate pred;
+  /*! \endcond */
+}; // end unary_negate
+
+/*! \p not1 is a helper function to simplify the creation of Adaptable Predicates:
+ * it takes an Adaptable Predicate \p pred as an argument and returns a new Adaptable
+ * Predicate that represents the negation of \p pred. That is: if \c pred is an object
+ * of a type which models Adaptable Predicate, then the type of the result
+ * \c npred of not1(pred) is also a model of Adaptable Predicate and
+ * npred(x) always returns the same value as !pred(x).
+ *
+ * \param pred The Adaptable Predicate to negate.
+ * \return A new object, npred such that npred(x) always returns
+ *         the same value as !pred(x).
+ *
+ * \tparam Predicate is a model of Adaptable Predicate.
+ *
+ * \see unary_negate
+ * \see not2
+ */
+template<typename Predicate>
+  __host__ __device__
+  unary_negate<Predicate> not1(const Predicate &pred);
+
+/*! \p binary_negate is a function object adaptor: it is an Adaptable Binary
+ * Predicate that represents the logical negation of some other Adaptable
+ * Binary Predicate. That is: if \c f is an object of class binary_negate,
+ * then there exists an object \c pred of class \c AdaptableBinaryPredicate
+ * such that f(x,y) always returns the same value as !pred(x,y).
+ * There is rarely any reason to construct a binary_negate directly;
+ * it is almost always easier to use the helper function not2.
+ *
+ * \see http://www.sgi.com/tech/stl/binary_negate.html
+ */
+template<typename Predicate>
+struct binary_negate
+    : public thrust::binary_function<typename Predicate::first_argument_type, typename Predicate::second_argument_type, bool>
+{
+  /*! Constructor takes a \p Predicate object to negate.
+   *  \param p The \p Predicate object to negate.
+   */
+  __host__ __device__
+  explicit binary_negate(Predicate p) : pred(p){}
+
+  /*! Function call operator. The return value is !pred(x,y).
+   */
+  __host__ __device__
+  bool operator()(const typename Predicate::first_argument_type& x, const typename Predicate::second_argument_type& y)
+  {
+    return !pred(x,y);
+  }
+
+  /*! \cond */
+  Predicate pred;
+  /*! \endcond */
+}; // end binary_negate
+
+/*! \p not2 is a helper function to simplify the creation of Adaptable Binary Predicates:
+ * it takes an Adaptable Binary Predicate \p pred as an argument and returns a new Adaptable
+ * Binary Predicate that represents the negation of \p pred. That is: if \c pred is an object
+ * of a type which models Adaptable Binary Predicate, then the type of the result
+ * \c npred of not2(pred) is also a model of Adaptable Binary Predicate and
+ * npred(x,y) always returns the same value as !pred(x,y).
+ *
+ * \param pred The Adaptable Binary Predicate to negate.
+ * \return A new object, npred such that npred(x,y) always returns
+ *         the same value as !pred(x,y).
+ *
+ * \tparam BinaryPredicate is a model of Adaptable Binary Predicate.
+ *
+ * \see binary_negate
+ * \see not1
+ */
+template<typename BinaryPredicate>
+  __host__ __device__
+  binary_negate<BinaryPredicate> not2(const BinaryPredicate &pred);
+
+/*! \}
+ */
+
+
+/*!
\addtogroup placeholder_objects Placeholder Objects + * \ingroup function_objects + * \{ + */ + + +/*! \namespace placeholders + * \brief Facilities for constructing simple functions inline. + * + * Objects in the \p thrust::placeholders namespace may be used to create simple arithmetic functions inline + * in an algorithm invocation. Combining placeholders such as \p _1 and \p _2 with arithmetic operations such as \c + + * creates an unnamed function object which applies the operation to their arguments. + * + * The type of placeholder objects is implementation-defined. + * + * The following code snippet demonstrates how to use the placeholders \p _1 and \p _2 with \p thrust::transform + * to implement the SAXPY computation: + * + * \code + * #include + * #include + * #include + * + * int main() + * { + * thrust::device_vector x(4), y(4); + * x[0] = 1; + * x[1] = 2; + * x[2] = 3; + * x[3] = 4; + * + * y[0] = 1; + * y[1] = 1; + * y[2] = 1; + * y[3] = 1; + * + * float a = 2.0f; + * + * using namespace thrust::placeholders; + * + * thrust::transform(x.begin(), x.end(), y.begin(), y.begin(), + * a * _1 + 2 + * ); + * + * // y is now {3, 5, 7, 9} + * } + * \endcode + */ +namespace placeholders +{ + + +/*! \p thrust::placeholders::_1 is the placeholder for the first function parameter. + */ +static const thrust::detail::functional::placeholder<0>::type _1; + + +/*! \p thrust::placeholders::_2 is the placeholder for the second function parameter. + */ +static const thrust::detail::functional::placeholder<1>::type _2; + + +/*! \p thrust::placeholders::_3 is the placeholder for the third function parameter. + */ +static const thrust::detail::functional::placeholder<2>::type _3; + + +/*! \p thrust::placeholders::_4 is the placeholder for the fourth function parameter. + */ +static const thrust::detail::functional::placeholder<3>::type _4; + + +/*! \p thrust::placeholders::_5 is the placeholder for the fifth function parameter. + */ +static const thrust::detail::functional::placeholder<4>::type _5; + + +/*! \p thrust::placeholders::_6 is the placeholder for the sixth function parameter. + */ +static const thrust::detail::functional::placeholder<5>::type _6; + + +/*! \p thrust::placeholders::_7 is the placeholder for the seventh function parameter. + */ +static const thrust::detail::functional::placeholder<6>::type _7; + + +/*! \p thrust::placeholders::_8 is the placeholder for the eighth function parameter. + */ +static const thrust::detail::functional::placeholder<7>::type _8; + + +/*! \p thrust::placeholders::_9 is the placeholder for the ninth function parameter. + */ +static const thrust::detail::functional::placeholder<8>::type _9; + + +/*! \p thrust::placeholders::_10 is the placeholder for the tenth function parameter. + */ +static const thrust::detail::functional::placeholder<9>::type _10; + + +} // end placeholders + + +/*! \} // placeholder_objects + */ + + +} // end thrust + +#include +#include + diff --git a/compat/thrust/gather.h b/compat/thrust/gather.h new file mode 100644 index 0000000..f2b8233 --- /dev/null +++ b/compat/thrust/gather.h @@ -0,0 +1,438 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file gather.h + * \brief Irregular copying from a source range + */ + +#pragma once + +#include +#include + +namespace thrust +{ + + +/*! \addtogroup gathering + * \ingroup copying + * \{ + */ + + +/*! \p gather copies elements from a source array into a destination range according + * to a map. For each input iterator \c i in the range [map_first, map_last), the + * value input_first[\*i] is assigned to *(result + (i - map_first)). + * \p RandomAccessIterator must permit random access. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param map_first Beginning of the range of gather locations. + * \param map_last End of the range of gather locations. + * \param input_first Beginning of the source range. + * \param result Beginning of the destination range. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator must be a model of Input Iterator and \c InputIterator's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type. + * \tparam RandomAccessIterator must be a model of Random Access Iterator and \c RandomAccessIterator's \c value_type must be convertible to \c OutputIterator's \c value_type. + * \tparam OutputIterator must be a model of Output Iterator. + * + * \pre The range [map_first, map_last) shall not overlap the range [result, result + (map_last - map_first)). + * + * \remark \p gather is the inverse of thrust::scatter. + * + * The following code snippet demonstrates how to use \p gather to reorder + * a range using the \p thrust::device execution policy for parallelization: + * + * \code + * #include + * #include + * #include + * ... + * // mark even indices with a 1; odd indices with a 0 + * int values[10] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0}; + * thrust::device_vector d_values(values, values + 10); + * + * // gather all even indices into the first half of the range + * // and odd indices to the last half of the range + * int map[10] = {0, 2, 4, 6, 8, 1, 3, 5, 7, 9}; + * thrust::device_vector d_map(map, map + 10); + * + * thrust::device_vector d_output(10); + * thrust::gather(thrust::device, + * d_map.begin(), d_map.end(), + * d_values.begin(), + * d_output.begin()); + * // d_output is now {1, 1, 1, 1, 1, 0, 0, 0, 0, 0} + * \endcode + */ +template + OutputIterator gather(const thrust::detail::execution_policy_base &exec, + InputIterator map_first, + InputIterator map_last, + RandomAccessIterator input_first, + OutputIterator result); + + +/*! \p gather copies elements from a source array into a destination range according + * to a map. For each input iterator \c i in the range [map_first, map_last), the + * value input_first[\*i] is assigned to *(result + (i - map_first)). + * \p RandomAccessIterator must permit random access. + * + * \param map_first Beginning of the range of gather locations. + * \param map_last End of the range of gather locations. + * \param input_first Beginning of the source range. + * \param result Beginning of the destination range. 
+ * + * \tparam InputIterator must be a model of Input Iterator and \c InputIterator's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type. + * \tparam RandomAccessIterator must be a model of Random Access Iterator and \c RandomAccessIterator's \c value_type must be convertible to \c OutputIterator's \c value_type. + * \tparam OutputIterator must be a model of Output Iterator. + * + * \pre The range [map_first, map_last) shall not overlap the range [result, result + (map_last - map_first)). + * + * \remark \p gather is the inverse of thrust::scatter. + * + * The following code snippet demonstrates how to use \p gather to reorder + * a range. + * + * \code + * #include + * #include + * ... + * // mark even indices with a 1; odd indices with a 0 + * int values[10] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0}; + * thrust::device_vector d_values(values, values + 10); + * + * // gather all even indices into the first half of the range + * // and odd indices to the last half of the range + * int map[10] = {0, 2, 4, 6, 8, 1, 3, 5, 7, 9}; + * thrust::device_vector d_map(map, map + 10); + * + * thrust::device_vector d_output(10); + * thrust::gather(d_map.begin(), d_map.end(), + * d_values.begin(), + * d_output.begin()); + * // d_output is now {1, 1, 1, 1, 1, 0, 0, 0, 0, 0} + * \endcode + */ +template + OutputIterator gather(InputIterator map_first, + InputIterator map_last, + RandomAccessIterator input_first, + OutputIterator result); + + +/*! \p gather_if conditionally copies elements from a source array into a destination + * range according to a map. For each input iterator \c i in the range [map_first, map_last), + * such that the value of \*(stencil + (i - map_first)) is \c true, the value + * input_first[\*i] is assigned to *(result + (i - map_first)). + * \p RandomAccessIterator must permit random access. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param map_first Beginning of the range of gather locations. + * \param map_last End of the range of gather locations. + * \param stencil Beginning of the range of predicate values. + * \param input_first Beginning of the source range. + * \param result Beginning of the destination range. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator1 must be a model of Input Iterator and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type. + * \tparam InputIterator2 must be a model of Input Iterator and \c InputIterator2's \c value_type must be convertible to \c bool. + * \tparam RandomAccessIterator must be a model of Random Access iterator and \c RandomAccessIterator's \c value_type must be convertible to \c OutputIterator's \c value_type. + * \tparam OutputIterator must be a model of Output Iterator. + * + * \pre The range [map_first, map_last) shall not overlap the range [result, result + (map_last - map_first)). + * \pre The range [stencil, stencil + (map_last - map_first)) shall not overlap the range [result, result + (map_last - map_first)). + * + * \remark \p gather_if is the inverse of \p scatter_if. + * + * The following code snippet demonstrates how to use \p gather_if to gather selected values from + * an input range using the \p thrust::device execution policy: + * + * \code + * #include + * #include + * #include + * ... 
+ * + * int values[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + * thrust::device_vector d_values(values, values + 10); + * + * // select elements at even-indexed locations + * int stencil[10] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0}; + * thrust::device_vector d_stencil(stencil, stencil + 10); + * + * // map all even indices into the first half of the range + * // and odd indices to the last half of the range + * int map[10] = {0, 2, 4, 6, 8, 1, 3, 5, 7, 9}; + * thrust::device_vector d_map(map, map + 10); + * + * thrust::device_vector d_output(10, 7); + * thrust::gather_if(thrust::device, + * d_map.begin(), d_map.end(), + * d_stencil.begin(), + * d_values.begin(), + * d_output.begin()); + * // d_output is now {0, 7, 4, 7, 8, 7, 3, 7, 7, 7} + * \endcode + */ +template + OutputIterator gather_if(const thrust::detail::execution_policy_base &exec, + InputIterator1 map_first, + InputIterator1 map_last, + InputIterator2 stencil, + RandomAccessIterator input_first, + OutputIterator result); + + +/*! \p gather_if conditionally copies elements from a source array into a destination + * range according to a map. For each input iterator \c i in the range [map_first, map_last), + * such that the value of \*(stencil + (i - map_first)) is \c true, the value + * input_first[\*i] is assigned to *(result + (i - map_first)). + * \p RandomAccessIterator must permit random access. + * + * \param map_first Beginning of the range of gather locations. + * \param map_last End of the range of gather locations. + * \param stencil Beginning of the range of predicate values. + * \param input_first Beginning of the source range. + * \param result Beginning of the destination range. + * + * \tparam InputIterator1 must be a model of Input Iterator and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type. + * \tparam InputIterator2 must be a model of Input Iterator and \c InputIterator2's \c value_type must be convertible to \c bool. + * \tparam RandomAccessIterator must be a model of Random Access iterator and \c RandomAccessIterator's \c value_type must be convertible to \c OutputIterator's \c value_type. + * \tparam OutputIterator must be a model of Output Iterator. + * + * \pre The range [map_first, map_last) shall not overlap the range [result, result + (map_last - map_first)). + * \pre The range [stencil, stencil + (map_last - map_first)) shall not overlap the range [result, result + (map_last - map_first)). + * + * \remark \p gather_if is the inverse of \p scatter_if. + * + * The following code snippet demonstrates how to use \p gather_if to gather selected values from + * an input range. + * + * \code + * #include + * #include + * ... 
+ * + * int values[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + * thrust::device_vector d_values(values, values + 10); + * + * // select elements at even-indexed locations + * int stencil[10] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0}; + * thrust::device_vector d_stencil(stencil, stencil + 10); + * + * // map all even indices into the first half of the range + * // and odd indices to the last half of the range + * int map[10] = {0, 2, 4, 6, 8, 1, 3, 5, 7, 9}; + * thrust::device_vector d_map(map, map + 10); + * + * thrust::device_vector d_output(10, 7); + * thrust::gather_if(d_map.begin(), d_map.end(), + * d_stencil.begin(), + * d_values.begin(), + * d_output.begin()); + * // d_output is now {0, 7, 4, 7, 8, 7, 3, 7, 7, 7} + * \endcode + */ +template + OutputIterator gather_if(InputIterator1 map_first, + InputIterator1 map_last, + InputIterator2 stencil, + RandomAccessIterator input_first, + OutputIterator result); + + +/*! \p gather_if conditionally copies elements from a source array into a destination + * range according to a map. For each input iterator \c i in the range [map_first, map_last) + * such that the value of pred(\*(stencil + (i - map_first))) is \c true, + * the value input_first[\*i] is assigned to *(result + (i - map_first)). + * \p RandomAccessIterator must permit random access. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param map_first Beginning of the range of gather locations. + * \param map_last End of the range of gather locations. + * \param stencil Beginning of the range of predicate values. + * \param input_first Beginning of the source range. + * \param result Beginning of the destination range. + * \param pred Predicate to apply to the stencil values. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator1 must be a model of Input Iterator and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type. + * \tparam InputIterator2 must be a model of Input Iterator and \c InputIterator2's \c value_type must be convertible to \c Predicate's \c argument_type. + * \tparam RandomAccessIterator must be a model of Random Access iterator and \c RandomAccessIterator's \c value_type must be convertible to \c OutputIterator's \c value_type. + * \tparam OutputIterator must be a model of Output Iterator. + * \tparam Predicate must be a model of Predicate. + * + * \pre The range [map_first, map_last) shall not overlap the range [result, result + (map_last - map_first)). + * \pre The range [stencil, stencil + (map_last - map_first)) shall not overlap the range [result, result + (map_last - map_first)). + * + * \remark \p gather_if is the inverse of \p scatter_if. + * + * The following code snippet demonstrates how to use \p gather_if to gather selected values from + * an input range based on an arbitrary selection function using the \p thrust::device execution policy for parallelization: + * + * \code + * #include + * #include + * #include + * + * struct is_even + * { + * __host__ __device__ + * bool operator()(const int x) + * { + * return (x % 2) == 0; + * } + * }; + * ... 
+ * + * int values[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + * thrust::device_vector d_values(values, values + 10); + * + * // we will select an element when our stencil is even + * int stencil[10] = {0, 3, 4, 1, 4, 1, 2, 7, 8, 9}; + * thrust::device_vector d_stencil(stencil, stencil + 10); + * + * // map all even indices into the first half of the range + * // and odd indices to the last half of the range + * int map[10] = {0, 2, 4, 6, 8, 1, 3, 5, 7, 9}; + * thrust::device_vector d_map(map, map + 10); + * + * thrust::device_vector d_output(10, 7); + * thrust::gather_if(thrust::device, + * d_map.begin(), d_map.end(), + * d_stencil.begin(), + * d_values.begin(), + * d_output.begin(), + * is_even()); + * // d_output is now {0, 7, 4, 7, 8, 7, 3, 7, 7, 7} + * \endcode + */ +template + OutputIterator gather_if(const thrust::detail::execution_policy_base &exec, + InputIterator1 map_first, + InputIterator1 map_last, + InputIterator2 stencil, + RandomAccessIterator input_first, + OutputIterator result, + Predicate pred); + + +/*! \p gather_if conditionally copies elements from a source array into a destination + * range according to a map. For each input iterator \c i in the range [map_first, map_last) + * such that the value of pred(\*(stencil + (i - map_first))) is \c true, + * the value input_first[\*i] is assigned to *(result + (i - map_first)). + * \p RandomAccessIterator must permit random access. + * + * \param map_first Beginning of the range of gather locations. + * \param map_last End of the range of gather locations. + * \param stencil Beginning of the range of predicate values. + * \param input_first Beginning of the source range. + * \param result Beginning of the destination range. + * \param pred Predicate to apply to the stencil values. + * + * \tparam InputIterator1 must be a model of Input Iterator and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type. + * \tparam InputIterator2 must be a model of Input Iterator and \c InputIterator2's \c value_type must be convertible to \c Predicate's \c argument_type. + * \tparam RandomAccessIterator must be a model of Random Access iterator and \c RandomAccessIterator's \c value_type must be convertible to \c OutputIterator's \c value_type. + * \tparam OutputIterator must be a model of Output Iterator. + * \tparam Predicate must be a model of Predicate. + * + * \pre The range [map_first, map_last) shall not overlap the range [result, result + (map_last - map_first)). + * \pre The range [stencil, stencil + (map_last - map_first)) shall not overlap the range [result, result + (map_last - map_first)). + * + * \remark \p gather_if is the inverse of \p scatter_if. + * + * The following code snippet demonstrates how to use \p gather_if to gather selected values from + * an input range based on an arbitrary selection function. + * + * \code + * #include + * #include + * + * struct is_even + * { + * __host__ __device__ + * bool operator()(const int x) + * { + * return (x % 2) == 0; + * } + * }; + * ... 
+ * + * int values[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + * thrust::device_vector d_values(values, values + 10); + * + * // we will select an element when our stencil is even + * int stencil[10] = {0, 3, 4, 1, 4, 1, 2, 7, 8, 9}; + * thrust::device_vector d_stencil(stencil, stencil + 10); + * + * // map all even indices into the first half of the range + * // and odd indices to the last half of the range + * int map[10] = {0, 2, 4, 6, 8, 1, 3, 5, 7, 9}; + * thrust::device_vector d_map(map, map + 10); + * + * thrust::device_vector d_output(10, 7); + * thrust::gather_if(d_map.begin(), d_map.end(), + * d_stencil.begin(), + * d_values.begin(), + * d_output.begin(), + * is_even()); + * // d_output is now {0, 7, 4, 7, 8, 7, 3, 7, 7, 7} + * \endcode + */ +template + OutputIterator gather_if(InputIterator1 map_first, + InputIterator1 map_last, + InputIterator2 stencil, + RandomAccessIterator input_first, + OutputIterator result, + Predicate pred); + +/*! \} // gathering + */ + +} // end namespace thrust + +#include + diff --git a/compat/thrust/generate.h b/compat/thrust/generate.h new file mode 100644 index 0000000..1d52721 --- /dev/null +++ b/compat/thrust/generate.h @@ -0,0 +1,211 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file generate.h + * \brief Fills a range with values "generated" from a function of no arguments + */ + +#pragma once + +#include +#include + +namespace thrust +{ + + +/*! \addtogroup transformations + * \{ + */ + + +/*! \p generate assigns the result of invoking \p gen, a function object that takes no arguments, + * to each element in the range [first,last). + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The first element in the range of interest. + * \param last The last element in the range of interest. + * \param gen A function argument, taking no parameters, used to generate values to assign to + * elements in the range [first,last). + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam ForwardIterator is a model of Forward Iterator, + * and \p ForwardIterator is mutable. + * \tparam Generator is a model of Generator, + * and \p Generator's \c result_type is convertible to \p ForwardIterator's \c value_type. + * + * The following code snippet demonstrates how to fill a \c host_vector with random numbers, + * using the standard C library function \c rand using the \p thrust::host execution policy for parallelization: + * + * \code + * #include + * #include + * #include + * #include + * ... 
+ * thrust::host_vector v(10); + * srand(13); + * thrust::generate(thrust::host, v.begin(), v.end(), rand); + * + * // the elements of v are now pseudo-random numbers + * \endcode + * + * \see generate_n + * \see http://www.sgi.com/tech/stl/generate.html + */ +template + void generate(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + Generator gen); + + +/*! \p generate assigns the result of invoking \p gen, a function object that takes no arguments, + * to each element in the range [first,last). + * + * \param first The first element in the range of interest. + * \param last The last element in the range of interest. + * \param gen A function argument, taking no parameters, used to generate values to assign to + * elements in the range [first,last). + * + * \tparam ForwardIterator is a model of Forward Iterator, + * and \p ForwardIterator is mutable. + * \tparam Generator is a model of Generator, + * and \p Generator's \c result_type is convertible to \p ForwardIterator's \c value_type. + * + * The following code snippet demonstrates how to fill a \c host_vector with random numbers, + * using the standard C library function \c rand. + * + * \code + * #include + * #include + * #include + * #include + * ... + * thrust::host_vector v(10); + * srand(13); + * thrust::generate(v.begin(), v.end(), rand); + * + * // the elements of v are now pseudo-random numbers + * \endcode + * + * \see generate_n + * \see http://www.sgi.com/tech/stl/generate.html + */ +template + void generate(ForwardIterator first, + ForwardIterator last, + Generator gen); + + +/*! \p generate_n assigns the result of invoking \p gen, a function object that takes no arguments, + * to each element in the range [first,first + n). The return value is first + n. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The first element in the range of interest. + * \param n The size of the range of interest. + * \param gen A function argument, taking no parameters, used to generate values to assign to + * elements in the range [first,first + n). + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam OutputIterator is a model of Output Iterator. + * \tparam Size is an integral type (either signed or unsigned). + * \tparam Generator is a model of Generator, + * and \p Generator's \c result_type is convertible to a type in \p OutputIterator's set of \c value_types. + * + * The following code snippet demonstrates how to fill a \c host_vector with random numbers, + * using the standard C library function \c rand using the \p thrust::host execution policy for parallelization: + * + * \code + * #include + * #include + * #include + * #include + * ... + * thrust::host_vector v(10); + * srand(13); + * thrust::generate_n(thrust::host, v.begin(), 10, rand); + * + * // the elements of v are now pseudo-random numbers + * \endcode + * + * \see generate + * \see http://www.sgi.com/tech/stl/generate.html + */ +template + OutputIterator generate_n(const thrust::detail::execution_policy_base &exec, + OutputIterator first, + Size n, + Generator gen); + + +/*! \p generate_n assigns the result of invoking \p gen, a function object that takes no arguments, + * to each element in the range [first,first + n). The return value is first + n. + * + * \param first The first element in the range of interest. + * \param n The size of the range of interest. 
+ * \param gen A function argument, taking no parameters, used to generate values to assign to + * elements in the range [first,first + n). + * + * \tparam OutputIterator is a model of Output Iterator. + * \tparam Size is an integral type (either signed or unsigned). + * \tparam Generator is a model of Generator, + * and \p Generator's \c result_type is convertible to a type in \p OutputIterator's set of \c value_types. + * + * The following code snippet demonstrates how to fill a \c host_vector with random numbers, + * using the standard C library function \c rand. + * + * \code + * #include + * #include + * #include + * ... + * thrust::host_vector v(10); + * srand(13); + * thrust::generate_n(v.begin(), 10, rand); + * + * // the elements of v are now pseudo-random numbers + * \endcode + * + * \see generate + * \see http://www.sgi.com/tech/stl/generate.html + */ +template + OutputIterator generate_n(OutputIterator first, + Size n, + Generator gen); + + +/*! \} // end transformations + */ + +} // end namespace thrust + +#include + diff --git a/compat/thrust/host_vector.h b/compat/thrust/host_vector.h new file mode 100644 index 0000000..11b1ae0 --- /dev/null +++ b/compat/thrust/host_vector.h @@ -0,0 +1,424 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file host_vector.h + * \brief A dynamically-sizable array of elements which reside in the "host" memory space + */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ + +// forward declaration of device_vector +template class device_vector; + +/*! \addtogroup container_classes Container Classes + * \addtogroup host_containers Host Containers + * \ingroup container_classes + * \{ + */ + +/*! A \p host_vector is a container that supports random access to elements, + * constant time removal of elements at the end, and linear time insertion + * and removal of elements at the beginning or in the middle. The number of + * elements in a \p host_vector may vary dynamically; memory management is + * automatic. The memory associated with a \p host_vector resides in the memory + * space of the host associated with a parallel device. + * + * \see http://www.sgi.com/tech/stl/Vector.html + * \see device_vector + */ +template > + class host_vector + : public detail::vector_base +{ + private: + typedef detail::vector_base Parent; + + public: + /*! \cond */ + typedef typename Parent::size_type size_type; + typedef typename Parent::value_type value_type; + /*! \endcond */ + + /*! This constructor creates an empty \p host_vector. + */ + __host__ + host_vector(void) + :Parent() {} + + /*! This constructor creates a \p host_vector with the given + * size. + * \param n The number of elements to initially craete. + */ + __host__ + explicit host_vector(size_type n) + :Parent(n) {} + + /*! This constructor creates a \p host_vector with copies + * of an exemplar element. + * \param n The number of elements to initially create. + * \param value An element to copy. 
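+ *
+ *  A minimal usage sketch (assuming <thrust/host_vector.h> is included):
+ *  \code
+ *  thrust::host_vector<int> v(4, 7);
+ *  // v is now {7, 7, 7, 7}
+ *  \endcode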
+ */ + __host__ + explicit host_vector(size_type n, const value_type &value) + :Parent(n,value) {} + + /*! Copy constructor copies from an exemplar \p host_vector. + * \param v The \p host_vector to copy. + */ + __host__ + host_vector(const host_vector &v) + :Parent(v) {} + + /*! Assign operator copies from an exemplar \p host_vector. + * \param v The \p host_vector to copy. + */ + __host__ + host_vector &operator=(const host_vector &v) + { Parent::operator=(v); return *this; } + + /*! Copy constructor copies from an exemplar \p host_vector with different type. + * \param v The \p host_vector to copy. + */ + template + __host__ + host_vector(const host_vector &v) + :Parent(v) {} + + /*! Assign operator copies from an exemplar \p host_vector with different type. + * \param v The \p host_vector to copy. + */ + template + __host__ + host_vector &operator=(const host_vector &v) + { Parent::operator=(v); return *this; } + + /*! Copy constructor copies from an exemplar std::vector. + * \param v The std::vector to copy. + */ + template + __host__ + host_vector(const std::vector &v) + :Parent(v) {} + + /*! Assign operator copies from an exemplar std::vector. + * \param v The std::vector to copy. + */ + template + __host__ + host_vector &operator=(const std::vector &v) + { Parent::operator=(v); return *this;} + + /*! Copy constructor copies from an exemplar \p device_vector with possibly different type. + * \param v The \p device_vector to copy. + */ + template + __host__ + host_vector(const device_vector &v); + + /*! Assign operator copies from an exemplar \p device_vector. + * \param v The \p device_vector to copy. + */ + template + __host__ + host_vector &operator=(const device_vector &v) + { Parent::operator=(v); return *this; } + + /*! This constructor builds a \p host_vector from a range. + * \param first The beginning of the range. + * \param last The end of the range. + */ + template + __host__ + host_vector(InputIterator first, InputIterator last) + :Parent(first, last) {} + +// declare these members for the purpose of Doxygenating them +// they actually exist in a derived-from class +#if 0 + /*! \brief Resizes this vector to the specified number of elements. + * \param new_size Number of elements this vector should contain. + * \param x Data with which new elements should be populated. + * \throw std::length_error If n exceeds max_size(). + * + * This method will resize this vector to the specified number of + * elements. If the number is smaller than this vector's current + * size this vector is truncated, otherwise this vector is + * extended and new elements are populated with given data. + */ + void resize(size_type new_size, const value_type &x = value_type()); + + /*! Returns the number of elements in this vector. + */ + size_type size(void) const; + + /*! Returns the size() of the largest possible vector. + * \return The largest possible return value of size(). + */ + size_type max_size(void) const; + + /*! \brief If n is less than or equal to capacity(), this call has no effect. + * Otherwise, this method is a request for allocation of additional memory. If + * the request is successful, then capacity() is greater than or equal to + * n; otherwise, capacity() is unchanged. In either case, size() is unchanged. + * \throw std::length_error If n exceeds max_size(). + */ + void reserve(size_type n); + + /*! Returns the number of elements which have been reserved in this + * vector. + */ + size_type capacity(void) const; + + /*! 
This method shrinks the capacity of this vector to exactly + * fit its elements. + */ + void shrink_to_fit(void); + + /*! \brief Subscript access to the data contained in this vector_dev. + * \param n The index of the element for which data should be accessed. + * \return Read/write reference to data. + * + * This operator allows for easy, array-style, data access. + * Note that data access with this operator is unchecked and + * out_of_range lookups are not defined. + */ + reference operator[](size_type n); + + /*! \brief Subscript read access to the data contained in this vector_dev. + * \param n The index of the element for which data should be accessed. + * \return Read reference to data. + * + * This operator allows for easy, array-style, data access. + * Note that data access with this operator is unchecked and + * out_of_range lookups are not defined. + */ + const_reference operator[](size_type n) const; + + /*! This method returns an iterator pointing to the beginning of + * this vector. + * \return mStart + */ + iterator begin(void); + + /*! This method returns a const_iterator pointing to the beginning + * of this vector. + * \return mStart + */ + const_iterator begin(void) const; + + /*! This method returns a const_iterator pointing to the beginning + * of this vector. + * \return mStart + */ + const_iterator cbegin(void) const; + + /*! This method returns a reverse_iterator pointing to the beginning of + * this vector's reversed sequence. + * \return A reverse_iterator pointing to the beginning of this + * vector's reversed sequence. + */ + reverse_iterator rbegin(void); + + /*! This method returns a const_reverse_iterator pointing to the beginning of + * this vector's reversed sequence. + * \return A const_reverse_iterator pointing to the beginning of this + * vector's reversed sequence. + */ + const_reverse_iterator rbegin(void) const; + + /*! This method returns a const_reverse_iterator pointing to the beginning of + * this vector's reversed sequence. + * \return A const_reverse_iterator pointing to the beginning of this + * vector's reversed sequence. + */ + const_reverse_iterator crbegin(void) const; + + /*! This method returns an iterator pointing to one element past the + * last of this vector. + * \return begin() + size(). + */ + iterator end(void); + + /*! This method returns a const_iterator pointing to one element past the + * last of this vector. + * \return begin() + size(). + */ + const_iterator end(void) const; + + /*! This method returns a const_iterator pointing to one element past the + * last of this vector. + * \return begin() + size(). + */ + const_iterator cend(void) const; + + /*! This method returns a reverse_iterator pointing to one element past the + * last of this vector's reversed sequence. + * \return rbegin() + size(). + */ + reverse_iterator rend(void); + + /*! This method returns a const_reverse_iterator pointing to one element past the + * last of this vector's reversed sequence. + * \return rbegin() + size(). + */ + const_reverse_iterator rend(void) const; + + /*! This method returns a const_reverse_iterator pointing to one element past the + * last of this vector's reversed sequence. + * \return rbegin() + size(). + */ + const_reverse_iterator crend(void) const; + + /*! This method returns a const_reference referring to the first element of this + * vector. + * \return The first element of this vector. + */ + const_reference front(void) const; + + /*! This method returns a reference pointing to the first element of this + * vector. 
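+ *
+ *  A brief sketch (assuming <assert.h> and a non-empty vector):
+ *  \code
+ *  thrust::host_vector<int> v(3, 5);
+ *  assert(v.front() == v[0]);  // front() refers to the same element as v[0]
+ *  \endcode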
+ * \return The first element of this vector. + */ + reference front(void); + + /*! This method returns a const reference pointing to the last element of + * this vector. + * \return The last element of this vector. + */ + const_reference back(void) const; + + /*! This method returns a reference referring to the last element of + * this vector_dev. + * \return The last element of this vector. + */ + reference back(void); + + /*! This method returns a pointer to this vector's first element. + * \return A pointer to the first element of this vector. + */ + pointer data(void); + + /*! This method returns a const_pointer to this vector's first element. + * \return a const_pointer to the first element of this vector. + */ + const_pointer data(void) const; + + /*! This method resizes this vector to 0. + */ + void clear(void); + + /*! This method returns true iff size() == 0. + * \return true if size() == 0; false, otherwise. + */ + bool empty(void) const; + + /*! This method appends the given element to the end of this vector. + * \param x The element to append. + */ + void push_back(const value_type &x); + + /*! This method erases the last element of this vector, invalidating + * all iterators and references to it. + */ + void pop_back(void); + + /*! This method swaps the contents of this vector_base with another vector. + * \param v The vector with which to swap. + */ + void swap(host_vector &v); + + /*! This method removes the element at position pos. + * \param pos The position of the element of interest. + * \return An iterator pointing to the new location of the element that followed the element + * at position pos. + */ + iterator erase(iterator pos); + + /*! This method removes the range of elements [first,last) from this vector. + * \param first The beginning of the range of elements to remove. + * \param last The end of the range of elements to remove. + * \return An iterator pointing to the new location of the element that followed the last + * element in the sequence [first,last). + */ + iterator erase(iterator first, iterator last); + + /*! This method inserts a single copy of a given exemplar value at the + * specified position in this vector. + * \param position The insertion position. + * \param x The exemplar element to copy & insert. + * \return An iterator pointing to the newly inserted element. + */ + iterator insert(iterator position, const T &x); + + /*! This method inserts a copy of an exemplar value to a range at the + * specified position in this vector. + * \param position The insertion position + * \param n The number of insertions to perform. + * \param x The value to replicate and insert. + */ + void insert(iterator position, size_type n, const T &x); + + /*! This method inserts a copy of an input range at the specified position + * in this vector. + * \param position The insertion position. + * \param first The beginning of the range to copy. + * \param last The end of the range to copy. + * + * \tparam InputIterator is a model of Assignable. + */ + template + void insert(iterator position, InputIterator first, InputIterator last); + + /*! This version of \p assign replicates a given exemplar + * \p n times into this vector. + * \param n The number of times to copy \p x. + * \param x The exemplar element to replicate. + */ + void assign(size_type n, const T &x); + + /*! This version of \p assign makes this vector a copy of a given input range. + * \param first The beginning of the range to copy. + * \param last The end of the range to copy. 
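+ *
+ *  A brief sketch (hypothetical values, assuming <thrust/host_vector.h>):
+ *  \code
+ *  int data[3] = {1, 2, 3};
+ *  thrust::host_vector<int> v;
+ *  v.assign(data, data + 3);   // v is now {1, 2, 3}
+ *  \endcode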
+ * + * \tparam InputIterator is a model of Input Iterator. + */ + template + void assign(InputIterator first, InputIterator last); + + /*! This method returns a copy of this vector's allocator. + * \return A copy of the alloctor used by this vector. + */ + allocator_type get_allocator(void) const; +#endif // end doxygen-only members +}; // end host_vector + +/*! \} + */ + +} // end thrust + +#include + diff --git a/compat/thrust/inner_product.h b/compat/thrust/inner_product.h new file mode 100644 index 0000000..01f5541 --- /dev/null +++ b/compat/thrust/inner_product.h @@ -0,0 +1,262 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file inner_product.h + * \brief Mathematical inner product between ranges + */ + +#pragma once + +#include +#include + +namespace thrust +{ + + +/*! \addtogroup reductions + * \{ + * \addtogroup transformed_reductions Transformed Reductions + * \ingroup reductions + * \{ + */ + + +/*! \p inner_product calculates an inner product of the ranges + * [first1, last1) and [first2, first2 + (last1 - first1)). + * + * Specifically, this version of \p inner_product computes the sum + * init + (*first1 * *first2) + (*(first1+1) * *(first2+1)) + ... + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first1 The beginning of the first sequence. + * \param last1 The end of the first sequence. + * \param first2 The beginning of the second sequence. + * \param init Initial value of the result. + * \return The inner product of sequences [first1, last1) + * and [first2, last2) plus \p init. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator1 is a model of Input Iterator, + * \tparam InputIterator2 is a model of Input Iterator, + * \tparam OutputType is a model of Assignable, + * and if \c x is an object of type \p OutputType, and \c y is an object of \p InputIterator1's \c value_type, + * and \c z is an object of \p InputIterator2's \c value_type, then x + y * z is defined + * and is convertible to \p OutputType. + * + * The following code demonstrates how to use \p inner_product to + * compute the dot product of two vectors using the \p thrust::host execution policy for parallelization. + * + * \code + * #include + * #include + * ... + * float vec1[3] = {1.0f, 2.0f, 5.0f}; + * float vec2[3] = {4.0f, 1.0f, 5.0f}; + * + * float result = thrust::inner_product(thrust::host, vec1, vec1 + 3, vec2, 0.0f); + * + * // result == 31.0f + * \endcode + * + * \see http://www.sgi.com/tech/stl/inner_product.html + */ +template +OutputType inner_product(const thrust::detail::execution_policy_base &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + OutputType init); + + +/*! \p inner_product calculates an inner product of the ranges + * [first1, last1) and [first2, first2 + (last1 - first1)). 
+ * + * Specifically, this version of \p inner_product computes the sum + * init + (*first1 * *first2) + (*(first1+1) * *(first2+1)) + ... + * + * Unlike the C++ Standard Template Library function std::inner_product, + * this version offers no guarantee on order of execution. + * + * \param first1 The beginning of the first sequence. + * \param last1 The end of the first sequence. + * \param first2 The beginning of the second sequence. + * \param init Initial value of the result. + * \return The inner product of sequences [first1, last1) + * and [first2, last2) plus \p init. + * + * \tparam InputIterator1 is a model of Input Iterator, + * \tparam InputIterator2 is a model of Input Iterator, + * \tparam OutputType is a model of Assignable, + * and if \c x is an object of type \p OutputType, and \c y is an object of \p InputIterator1's \c value_type, + * and \c z is an object of \p InputIterator2's \c value_type, then x + y * z is defined + * and is convertible to \p OutputType. + * + * The following code demonstrates how to use \p inner_product to + * compute the dot product of two vectors. + * + * \code + * #include + * ... + * float vec1[3] = {1.0f, 2.0f, 5.0f}; + * float vec2[3] = {4.0f, 1.0f, 5.0f}; + * + * float result = thrust::inner_product(vec1, vec1 + 3, vec2, 0.0f); + * + * // result == 31.0f + * \endcode + * + * \see http://www.sgi.com/tech/stl/inner_product.html + */ +template +OutputType inner_product(InputIterator1 first1, InputIterator1 last1, + InputIterator2 first2, OutputType init); + + +/*! \p inner_product calculates an inner product of the ranges + * [first1, last1) and [first2, first2 + (last1 - first1)). + * + * This version of \p inner_product is identical to the first, except that is uses + * two user-supplied function objects instead of \c operator+ and \c operator*. + * + * Specifically, this version of \p inner_product computes the sum + * binary_op1( init, binary_op2(*first1, *first2) ), ... + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first1 The beginning of the first sequence. + * \param last1 The end of the first sequence. + * \param first2 The beginning of the second sequence. + * \param init Initial value of the result. + * \param binary_op1 Generalized addition operation. + * \param binary_op2 Generalized multiplication operation. + * \return The inner product of sequences [first1, last1) and [first2, last2). + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator1 is a model of Input Iterator, + * and \p InputIterator1's \c value_type is convertible to \p BinaryFunction2's \c first_argument_type. + * \tparam InputIterator2 is a model of Input Iterator. + * and \p InputIterator2's \c value_type is convertible to \p BinaryFunction2's \c second_argument_type. + * \tparam OutputType is a model of Assignable, + * and \p OutputType is convertible to \p BinaryFunction1's \c first_argument_type. + * \tparam BinaryFunction1 is a model of Binary Function, + * and \p BinaryFunction1's \c return_type is convertible to \p OutputType. + * \tparam BinaryFunction2 is a model of Binary Function, + * and \p BinaryFunction2's \c return_type is convertible to \p BinaryFunction1's \c second_argument_type. + * + * \code + * #include + * #include + * ... 
+ * float vec1[3] = {1.0f, 2.0f, 5.0f}; + * float vec2[3] = {4.0f, 1.0f, 5.0f}; + * + * float init = 0.0f; + * thrust::plus binary_op1; + * thrust::multiplies binary_op2; + * + * float result = thrust::inner_product(thrust::host, vec1, vec1 + 3, vec2, init, binary_op1, binary_op2); + * + * // result == 31.0f + * \endcode + * + * \see http://www.sgi.com/tech/stl/inner_product.html + */ +template +OutputType inner_product(const thrust::detail::execution_policy_base &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + OutputType init, + BinaryFunction1 binary_op1, + BinaryFunction2 binary_op2); + + +/*! \p inner_product calculates an inner product of the ranges + * [first1, last1) and [first2, first2 + (last1 - first1)). + * + * This version of \p inner_product is identical to the first, except that is uses + * two user-supplied function objects instead of \c operator+ and \c operator*. + * + * Specifically, this version of \p inner_product computes the sum + * binary_op1( init, binary_op2(*first1, *first2) ), ... + * + * Unlike the C++ Standard Template Library function std::inner_product, + * this version offers no guarantee on order of execution. + * + * \param first1 The beginning of the first sequence. + * \param last1 The end of the first sequence. + * \param first2 The beginning of the second sequence. + * \param init Initial value of the result. + * \param binary_op1 Generalized addition operation. + * \param binary_op2 Generalized multiplication operation. + * \return The inner product of sequences [first1, last1) and [first2, last2). + * + * \tparam InputIterator1 is a model of Input Iterator, + * and \p InputIterator1's \c value_type is convertible to \p BinaryFunction2's \c first_argument_type. + * \tparam InputIterator2 is a model of Input Iterator. + * and \p InputIterator2's \c value_type is convertible to \p BinaryFunction2's \c second_argument_type. + * \tparam OutputType is a model of Assignable, + * and \p OutputType is convertible to \p BinaryFunction1's \c first_argument_type. + * \tparam BinaryFunction1 is a model of Binary Function, + * and \p BinaryFunction1's \c return_type is convertible to \p OutputType. + * \tparam BinaryFunction2 is a model of Binary Function, + * and \p BinaryFunction2's \c return_type is convertible to \p BinaryFunction1's \c second_argument_type. + * + * \code + * #include + * ... + * float vec1[3] = {1.0f, 2.0f, 5.0f}; + * float vec2[3] = {4.0f, 1.0f, 5.0f}; + * + * float init = 0.0f; + * thrust::plus binary_op1; + * thrust::multiplies binary_op2; + * + * float result = thrust::inner_product(vec1, vec1 + 3, vec2, init, binary_op1, binary_op2); + * + * // result == 31.0f + * \endcode + * + * \see http://www.sgi.com/tech/stl/inner_product.html + */ +template +OutputType inner_product(InputIterator1 first1, InputIterator1 last1, + InputIterator2 first2, OutputType init, + BinaryFunction1 binary_op1, BinaryFunction2 binary_op2); + + +/*! \} // end transformed_reductions + * \} // end reductions + */ + +} // end namespace thrust + +#include + diff --git a/compat/thrust/iterator/constant_iterator.h b/compat/thrust/iterator/constant_iterator.h new file mode 100644 index 0000000..e9e03c1 --- /dev/null +++ b/compat/thrust/iterator/constant_iterator.h @@ -0,0 +1,251 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file thrust/iterator/constant_iterator.h + * \brief An iterator which returns a constant value when + * dereferenced + */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ + +/*! \addtogroup iterators + * \{ + */ + +/*! \addtogroup fancyiterator Fancy Iterators + * \ingroup iterators + * \{ + */ + +/*! \p constant_iterator is an iterator which represents a pointer into a range + * of constant values. This iterator is useful for creating a range filled with the same + * value without explicitly storing it in memory. Using \p constant_iterator saves both + * memory capacity and bandwidth. + * + * The following code snippet demonstrates how to create a \p constant_iterator whose + * \c value_type is \c int and whose value is \c 10. + * + * \code + * #include + * + * thrust::constant_iterator iter(10); + * + * *iter; // returns 10 + * iter[0]; // returns 10 + * iter[1]; // returns 10 + * iter[13]; // returns 10 + * + * // and so on... + * \endcode + * + * This next example demonstrates how to use a \p constant_iterator with the + * \p thrust::transform function to increment all elements of a sequence by the + * same value. We will create a temporary \p constant_iterator with the function + * \p make_constant_iterator function in order to avoid explicitly specifying + * its type: + * + * \code + * #include + * #include + * #include + * #include + * + * int main(void) + * { + * thrust::device_vector data(4); + * data[0] = 3; + * data[1] = 7; + * data[2] = 2; + * data[3] = 5; + * + * // add 10 to all values in data + * thrust::transform(data.begin(), data.end(), + * thrust::make_constant_iterator(10), + * data.begin(), + * thrust::plus()); + * + * // data is now [13, 17, 12, 15] + * + * return 0; + * } + * \endcode + * + * \see make_constant_iterator + */ +template + class constant_iterator + : public detail::constant_iterator_base::type +{ + /*! \cond + */ + friend class thrust::iterator_core_access; + typedef typename detail::constant_iterator_base::type super_t; + typedef typename detail::constant_iterator_base::incrementable incrementable; + typedef typename detail::constant_iterator_base::base_iterator base_iterator; + + public: + typedef typename super_t::reference reference; + typedef typename super_t::value_type value_type; + + /*! \endcond + */ + + /*! Null constructor initializes this \p constant_iterator's constant using its + * null constructor. + */ + __host__ __device__ + constant_iterator(void) + : super_t(), m_value(){}; + + /*! Copy constructor copies the value of another \p constant_iterator into this + * \p constant_iterator. + * + * \p rhs The constant_iterator to copy. + */ + __host__ __device__ + constant_iterator(constant_iterator const &rhs) + : super_t(rhs.base()), m_value(rhs.m_value) {} + + /*! Copy constructor copies the value of another \p constant_iterator with related + * System type. + * + * \param rhs The \p constant_iterator to copy. 
+ */ + template + __host__ __device__ + constant_iterator(constant_iterator const &rhs, + typename thrust::detail::enable_if_convertible< + typename thrust::iterator_system >::type, + typename thrust::iterator_system::type + >::type * = 0) + : super_t(rhs.base()), m_value(rhs.value()) {} + + /*! This constructor receives a value to use as the constant value of this + * \p constant_iterator and an index specifying the location of this + * \p constant_iterator in a sequence. + * + * \p v The value of this \p constant_iterator's constant value. + * \p i The index of this \p constant_iterator in a sequence. Defaults to the + * value returned by \c Incrementable's null constructor. For example, + * when Incrementable == int, \c 0. + */ + __host__ __device__ + constant_iterator(value_type const& v, incrementable const &i = incrementable()) + : super_t(base_iterator(i)), m_value(v) {} + + /*! This constructor is templated to allow construction from a value type and + * incrementable type related this this \p constant_iterator's respective types. + * + * \p v The value of this \p constant_iterator's constant value. + * \p i The index of this \p constant_iterator in a sequence. Defaults to the + * value returned by \c Incrementable's null constructor. For example, + * when Incrementable == int, \c 0. + */ + template + __host__ __device__ + constant_iterator(OtherValue const& v, OtherIncrementable const& i = incrementable()) + : super_t(base_iterator(i)), m_value(v) {} + + /*! This method returns the value of this \p constant_iterator's constant value. + * \return A \c const reference to this \p constant_iterator's constant value. + */ + __host__ __device__ + Value const& value(void) const + { return m_value; } + + /*! \cond + */ + + protected: + __host__ __device__ + Value const& value_reference(void) const + { return m_value; } + + __host__ __device__ + Value & value_reference(void) + { return m_value; } + + private: // Core iterator interface + __host__ __device__ + reference dereference(void) const + { + return m_value; + } + + private: + Value m_value; + + /*! \endcond + */ +}; // end constant_iterator + + +/*! This version of \p make_constant_iterator creates a \p constant_iterator + * from values given for both value and index. The type of \p constant_iterator + * may be inferred by the compiler from the types of its parameters. + * + * \param x The value of the returned \p constant_iterator's constant value. + * \param i The index of the returned \p constant_iterator within a sequence. + * The type of this parameter defaults to \c int. In the default case, + * the value of this parameter is \c 0. + * + * \return A new \p constant_iterator with constant value & index as given + * by \p x & \p i. + * + * \see constant_iterator + */ +template +inline __host__ __device__ +constant_iterator make_constant_iterator(V x, I i = int()) +{ + return constant_iterator(x, i); +} // end make_constant_iterator() + + +/*! This version of \p make_constant_iterator creates a \p constant_iterator + * using only a parameter for the desired constant value. The value of the + * returned \p constant_iterator's index is set to \c 0. + * + * \param x The value of the returned \p constant_iterator's constant value. + * \return A new \p constant_iterator with constant value equal to \p x and + * index equal to \c 0. + * \see constant_iterator + */ +template +inline __host__ __device__ +constant_iterator make_constant_iterator(V x) +{ + return constant_iterator(x, 0); +} // end make_constant_iterator() + +/*! 
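A compilable version of the transform example from the constant_iterator class documentation above, with the template arguments spelled out; it assumes the <thrust/iterator/constant_iterator.h>, <thrust/device_vector.h>, <thrust/transform.h> and <thrust/functional.h> headers from this library.

\code
#include <thrust/iterator/constant_iterator.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/functional.h>

int main(void)
{
  thrust::device_vector<int> data(4);
  data[0] = 3; data[1] = 7; data[2] = 2; data[3] = 5;

  // make_constant_iterator(10) yields an unbounded range of 10s;
  // adding it element-wise increments every value by 10
  thrust::transform(data.begin(), data.end(),
                    thrust::make_constant_iterator(10),
                    data.begin(),
                    thrust::plus<int>());

  // data is now [13, 17, 12, 15]
  return 0;
}
\endcode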
\} // end fancyiterators + */ + +/*! \} // end iterators + */ + +} // end namespace thrust + diff --git a/compat/thrust/iterator/counting_iterator.h b/compat/thrust/iterator/counting_iterator.h new file mode 100644 index 0000000..99812ca --- /dev/null +++ b/compat/thrust/iterator/counting_iterator.h @@ -0,0 +1,243 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file thrust/iterator/counting_iterator.h + * \brief An iterator which returns an increasing incrementable value + * when dereferenced + */ + +/* + * Copyright David Abrahams 2003. + * + * Distributed under the Boost Software License, Version 1.0. + * (See accompanying NOTICE file for the complete license) + * + * For more information, see http://www.boost.org + */ + +#pragma once + +#include +#include +#include +#include + +// #include the details first +#include + +namespace thrust +{ + +/*! \addtogroup iterators + * \{ + */ + +/*! \addtogroup fancyiterator Fancy Iterators + * \ingroup iterators + * \{ + */ + +/*! \p counting_iterator is an iterator which represents a pointer into a range + * of sequentially changing values. This iterator is useful for creating a range + * filled with a sequence without explicitly storing it in memory. Using + * \p counting_iterator saves memory capacity and bandwidth. + * + * The following code snippet demonstrates how to create a \p counting_iterator whose + * \c value_type is \c int and which sequentially increments by \c 1. + * + * \code + * #include + * ... + * // create iterators + * thrust::counting_iterator first(10); + * thrust::counting_iterator last = first + 3; + * + * first[0] // returns 10 + * first[1] // returns 11 + * first[100] // returns 110 + * + * // sum of [first, last) + * thrust::reduce(first, last); // returns 33 (i.e. 10 + 11 + 12) + * + * // initialize vector to [0,1,2,..] + * thrust::counting_iterator iter(0); + * thrust::device_vector vec(500); + * thrust::copy(iter, iter + vec.size(), vec.begin()); + * \endcode + * + * This next example demonstrates how to use a \p counting_iterator with the + * \p thrust::copy_if function to compute the indices of the non-zero elements + * of a \p device_vector. In this example, we use the \p make_counting_iterator + * function to avoid specifying the type of the \p counting_iterator. 
+ * + * \code + * #include + * #include + * #include + * #include + * + * int main(void) + * { + * // this example computes indices for all the nonzero values in a sequence + * + * // sequence of zero and nonzero values + * thrust::device_vector stencil(8); + * stencil[0] = 0; + * stencil[1] = 1; + * stencil[2] = 1; + * stencil[3] = 0; + * stencil[4] = 0; + * stencil[5] = 1; + * stencil[6] = 0; + * stencil[7] = 1; + * + * // storage for the nonzero indices + * thrust::device_vector indices(8); + * + * // compute indices of nonzero elements + * typedef thrust::device_vector::iterator IndexIterator; + * + * // use make_counting_iterator to define the sequence [0, 8) + * IndexIterator indices_end = thrust::copy_if(thrust::make_counting_iterator(0), + * thrust::make_counting_iterator(8), + * stencil.begin(), + * indices.begin(), + * thrust::identity()); + * // indices now contains [1,2,5,7] + * + * return 0; + * } + * \endcode + * + * \see make_counting_iterator + */ +template + class counting_iterator + : public detail::counting_iterator_base::type +{ + /*! \cond + */ + typedef typename detail::counting_iterator_base::type super_t; + + friend class thrust::iterator_core_access; + + public: + typedef typename super_t::reference reference; + typedef typename super_t::difference_type difference_type; + + /*! \endcond + */ + + /*! Null constructor initializes this \p counting_iterator's \c Incrementable + * counter using its null constructor. + */ + __host__ __device__ + counting_iterator(void){}; + + /*! Copy constructor copies the value of another \p counting_iterator into a + * new \p counting_iterator. + * + * \p rhs The \p counting_iterator to copy. + */ + __host__ __device__ + counting_iterator(counting_iterator const &rhs):super_t(rhs.base()){} + + /*! Copy constructor copies the value of another counting_iterator + * with related System type. + * + * \param rhs The \p counting_iterator to copy. + */ + template + __host__ __device__ + counting_iterator(counting_iterator const &rhs, + typename thrust::detail::enable_if_convertible< + typename thrust::iterator_system >::type, + typename thrust::iterator_system::type + >::type * = 0) + : super_t(rhs.base()){} + + /*! This \c explicit constructor copies the value of an \c Incrementable + * into a new \p counting_iterator's \c Incrementable counter. + * + * \param x The initial value of the new \p counting_iterator's \c Incrementable + * counter. + */ + __host__ __device__ + explicit counting_iterator(Incrementable x):super_t(x){} + + /*! \cond + */ + private: + __host__ __device__ + reference dereference(void) const + { + return this->base_reference(); + } + + // note that we implement equal specially for floating point counting_iterator + template + __host__ __device__ + bool equal(counting_iterator const& y) const + { + typedef thrust::detail::counting_iterator_equal e; + return e::equal(this->base(), y.base()); + } + + template + __host__ __device__ + difference_type + distance_to(counting_iterator const& y) const + { + typedef typename + thrust::detail::eval_if< + thrust::detail::is_numeric::value, + thrust::detail::identity_ >, + thrust::detail::identity_ > + >::type d; + + return d::distance(this->base(), y.base()); + } + + /*! \endcond + */ +}; // end counting_iterator + + +/*! \p make_counting_iterator creates a \p counting_iterator + * using an initial value for its \c Incrementable counter. + * + * \param x The initial value of the new \p counting_iterator's counter. 
+ * \return A new \p counting_iterator whose counter has been initialized to \p x. + */ +template +inline __host__ __device__ +counting_iterator make_counting_iterator(Incrementable x) +{ + return counting_iterator(x); +} + +/*! \} // end fancyiterators + */ + +/*! \} // end iterators + */ + +} // end thrust + diff --git a/compat/thrust/iterator/detail/any_assign.h b/compat/thrust/iterator/detail/any_assign.h new file mode 100644 index 0000000..e08a829 --- /dev/null +++ b/compat/thrust/iterator/detail/any_assign.h @@ -0,0 +1,55 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace thrust +{ +namespace detail +{ + + +// a type which may be assigned any other type +struct any_assign +{ + inline __host__ __device__ any_assign(void) + {} + + template + inline __host__ __device__ any_assign(T) + {} + + template + inline __host__ __device__ + any_assign &operator=(T) + { + if(0) + { + // trick the compiler into silencing "warning: this expression has no effect" + int *x = 0; + *x = 13; + } // end if + + return *this; + } +}; + + +} // end detail +} // end thrust + diff --git a/compat/thrust/iterator/detail/any_system_tag.h b/compat/thrust/iterator/detail/any_system_tag.h new file mode 100644 index 0000000..fc6417a --- /dev/null +++ b/compat/thrust/iterator/detail/any_system_tag.h @@ -0,0 +1,37 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace thrust +{ + +struct any_system_tag + : thrust::execution_policy +{ + // allow any_system_tag to convert to any type at all + // XXX make this safer using enable_if> upon c++11 + template operator T () const {return T();} +}; + +// TODO remove this in 1.7.0 +typedef THRUST_DEPRECATED any_system_tag any_space_tag; + +} // end thrust + diff --git a/compat/thrust/iterator/detail/constant_iterator_base.h b/compat/thrust/iterator/detail/constant_iterator_base.h new file mode 100644 index 0000000..276e5ff --- /dev/null +++ b/compat/thrust/iterator/detail/constant_iterator_base.h @@ -0,0 +1,70 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
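The copy_if example from the counting_iterator documentation above, filled out with explicit template arguments; it assumes the <thrust/iterator/counting_iterator.h>, <thrust/device_vector.h>, <thrust/copy.h> and <thrust/functional.h> headers from this library.

\code
#include <thrust/iterator/counting_iterator.h>
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/functional.h>

int main(void)
{
  // sequence of zero and nonzero values
  thrust::device_vector<int> stencil(8);
  stencil[0] = 0; stencil[1] = 1; stencil[2] = 1; stencil[3] = 0;
  stencil[4] = 0; stencil[5] = 1; stencil[6] = 0; stencil[7] = 1;

  // storage for the nonzero indices
  thrust::device_vector<int> indices(8);

  // the index sequence [0, 8) is produced on the fly by counting_iterator
  thrust::device_vector<int>::iterator indices_end =
    thrust::copy_if(thrust::make_counting_iterator(0),
                    thrust::make_counting_iterator(8),
                    stencil.begin(),
                    indices.begin(),
                    thrust::identity<int>());

  // [indices.begin(), indices_end) now contains [1, 2, 5, 7]
  return 0;
}
\endcode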
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace thrust +{ + +// forward declaration of constant_iterator +template class constant_iterator; + +namespace detail +{ + +template + struct constant_iterator_base +{ + typedef Value value_type; + + // the reference type is the same as the value_type. + // we wish to avoid returning a reference to the internal state + // of the constant_iterator, which is prone to subtle bugs. + // consider the temporary iterator created in the expression + // *(iter + i) + typedef value_type reference; + + // the incrementable type is int unless otherwise specified + typedef typename thrust::detail::ia_dflt_help< + Incrementable, + thrust::detail::identity_ + >::type incrementable; + + typedef typename thrust::counting_iterator< + incrementable, + System, + thrust::random_access_traversal_tag + > base_iterator; + + typedef typename thrust::iterator_adaptor< + constant_iterator, + base_iterator, + value_type, // XXX we may need to pass const value_type here as boost counting_iterator does + typename thrust::iterator_system::type, + typename thrust::iterator_traversal::type, + reference + > type; +}; // end constant_iterator_base + +} // end detail + +} // end thrust + diff --git a/compat/thrust/iterator/detail/counting_iterator.inl b/compat/thrust/iterator/detail/counting_iterator.inl new file mode 100644 index 0000000..ad4fcff --- /dev/null +++ b/compat/thrust/iterator/detail/counting_iterator.inl @@ -0,0 +1,141 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
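Why constant_iterator_base's by-value reference matters: in the expression below, iter + 3 is a temporary iterator destroyed at the end of the statement, so a reference into its internal state would dangle; because reference is typedef'd to value_type, the dereference hands back a copy instead. A small sketch, assuming <thrust/iterator/constant_iterator.h>.

\code
#include <thrust/iterator/constant_iterator.h>

int main(void)
{
  thrust::constant_iterator<int> iter(10);

  // iter + 3 is a temporary constant_iterator; since reference == value_type,
  // x is a copy of the constant, not a dangling reference into the temporary
  int x = *(iter + 3);   // x == 10

  (void)x;
  return 0;
}
\endcode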
+ */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace thrust +{ + +// forward declaration of counting_iterator +template + class counting_iterator; + +namespace detail +{ + +template + struct counting_iterator_base +{ + typedef typename thrust::detail::eval_if< + // use any_system_tag if we are given use_default + thrust::detail::is_same::value, + thrust::detail::identity_, + thrust::detail::identity_ + >::type system; + + typedef typename thrust::detail::ia_dflt_help< + Traversal, + thrust::detail::eval_if< + thrust::detail::is_numeric::value, + thrust::detail::identity_, + thrust::iterator_traversal + > + >::type traversal; + + // unlike Boost, we explicitly use std::ptrdiff_t as the difference type + // for floating point counting_iterators + typedef typename thrust::detail::ia_dflt_help< + Difference, + thrust::detail::eval_if< + thrust::detail::is_numeric::value, + thrust::detail::eval_if< + thrust::detail::is_integral::value, + thrust::detail::numeric_difference, + thrust::detail::identity_ + >, + thrust::iterator_difference + > + >::type difference; + + // our implementation departs from Boost's in that counting_iterator::dereference + // returns a copy of its counter, rather than a reference to it. returning a reference + // to the internal state of an iterator causes subtle bugs (consider the temporary + // iterator created in the expression *(iter + i) ) and has no compelling use case + typedef thrust::iterator_adaptor< + counting_iterator, // self + Incrementable, // Base + Incrementable, // XXX we may need to pass const here as Boost does + system, + traversal, + Incrementable, + difference + > type; +}; // end counting_iterator_base + + +template + struct iterator_distance +{ + __host__ __device__ + static Difference distance(Incrementable1 x, Incrementable2 y) + { + return y - x; + } +}; + + +template + struct number_distance +{ + __host__ __device__ + static Difference distance(Incrementable1 x, Incrementable2 y) + { + return static_cast(numeric_distance(x,y)); + } +}; + + +template + struct counting_iterator_equal +{ + __host__ __device__ + static bool equal(Incrementable1 x, Incrementable2 y) + { + return x == y; + } +}; + + +// specialization for floating point equality +template + struct counting_iterator_equal< + Difference, + Incrementable1, + Incrementable2, + typename thrust::detail::enable_if< + thrust::detail::is_floating_point::value || + thrust::detail::is_floating_point::value + >::type + > +{ + __host__ __device__ + static bool equal(Incrementable1 x, Incrementable2 y) + { + typedef number_distance d; + return d::distance(x,y) == 0; + } +}; + + +} // end detail +} // end thrust + diff --git a/compat/thrust/iterator/detail/device_system_tag.h b/compat/thrust/iterator/detail/device_system_tag.h new file mode 100644 index 0000000..ab66fb4 --- /dev/null +++ b/compat/thrust/iterator/detail/device_system_tag.h @@ -0,0 +1,40 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
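A short sketch of the departure from Boost described above: for a floating-point Incrementable the difference_type is std::ptrdiff_t, and equality dispatches to the floating-point specialization of counting_iterator_equal. Assumes <thrust/iterator/counting_iterator.h>.

\code
#include <thrust/iterator/counting_iterator.h>
#include <cstddef>

int main(void)
{
  thrust::counting_iterator<float> first(0.0f);
  thrust::counting_iterator<float> last = first + 10;

  // difference_type is std::ptrdiff_t, not float, as promised above
  std::ptrdiff_t n = last - first;   // n == 10

  // operator== goes through the floating-point specialization of
  // counting_iterator_equal: iterators compare equal exactly when the
  // truncated distance between their counters is zero
  bool same = (first == last);       // false: the distance is 10

  (void)n; (void)same;
  return 0;
}
\endcode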
+ */ + +#pragma once + +#include + +// #include the device system's execution_policy header +#define __THRUST_DEVICE_SYSTEM_TAG_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/execution_policy.h> +#include __THRUST_DEVICE_SYSTEM_TAG_HEADER +#undef __THRUST_DEVICE_SYSTEM_TAG_HEADER + +namespace thrust +{ + +typedef thrust::system::__THRUST_DEVICE_SYSTEM_NAMESPACE::tag device_system_tag; + +} // end thrust + +// TODO remove this in 1.8.0 +namespace thrust +{ + +typedef THRUST_DEPRECATED device_system_tag device_space_tag; + +} // end thrust + diff --git a/compat/thrust/iterator/detail/discard_iterator_base.h b/compat/thrust/iterator/detail/discard_iterator_base.h new file mode 100644 index 0000000..1909ca8 --- /dev/null +++ b/compat/thrust/iterator/detail/discard_iterator_base.h @@ -0,0 +1,65 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include // for std::ptrdiff_t + +namespace thrust +{ + +// forward declaration of discard_iterator +template class discard_iterator; + +namespace detail +{ + + +template + struct discard_iterator_base +{ + // XXX value_type should actually be void + // but this interferes with zip_iterator + typedef any_assign value_type; + typedef any_assign& reference; + typedef std::ptrdiff_t incrementable; + + typedef typename thrust::counting_iterator< + incrementable, + System, + thrust::random_access_traversal_tag + > base_iterator; + + typedef typename thrust::iterator_adaptor< + discard_iterator, + base_iterator, + value_type, + typename thrust::iterator_system::type, + typename thrust::iterator_traversal::type, + reference + > type; +}; // end discard_iterator_base + + +} // end detail + +} // end thrust + + diff --git a/compat/thrust/iterator/detail/distance_from_result.h b/compat/thrust/iterator/detail/distance_from_result.h new file mode 100644 index 0000000..bf83e6c --- /dev/null +++ b/compat/thrust/iterator/detail/distance_from_result.h @@ -0,0 +1,42 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
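discard_iterator_base above underpins thrust::discard_iterator, whose assignments all land in an any_assign sink; that is useful when an algorithm insists on producing an output range the caller does not need. A hedged sketch, assuming the <thrust/iterator/discard_iterator.h>, <thrust/device_vector.h>, <thrust/fill.h> and <thrust/reduce.h> headers defined elsewhere in this library.

\code
#include <thrust/iterator/discard_iterator.h>
#include <thrust/device_vector.h>
#include <thrust/fill.h>
#include <thrust/reduce.h>

int main(void)
{
  thrust::device_vector<int> keys(6), vals(6);
  keys[0] = 1; keys[1] = 1; keys[2] = 2; keys[3] = 2; keys[4] = 2; keys[5] = 3;
  thrust::fill(vals.begin(), vals.end(), 1);

  thrust::device_vector<int> sums(3);

  // only the per-key sums are wanted; the compacted keys are discarded
  thrust::reduce_by_key(keys.begin(), keys.end(),
                        vals.begin(),
                        thrust::make_discard_iterator(),
                        sums.begin());

  // sums is now [2, 3, 1]
  return 0;
}
\endcode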
+ */ + +#pragma once + +#include +#include + +namespace thrust +{ + +namespace detail +{ + +// since both arguments are known to be specializations of iterator_facade, +// it's legal to access IteratorFacade2::difference_type +template + struct distance_from_result + : eval_if< + is_convertible::value, + identity_, + identity_ + > +{}; + +} // end detail + +} // end thrust + diff --git a/compat/thrust/iterator/detail/host_system_tag.h b/compat/thrust/iterator/detail/host_system_tag.h new file mode 100644 index 0000000..26d3f7d --- /dev/null +++ b/compat/thrust/iterator/detail/host_system_tag.h @@ -0,0 +1,40 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// #include the host system's execution_policy header +#define __THRUST_HOST_SYSTEM_TAG_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/execution_policy.h> +#include __THRUST_HOST_SYSTEM_TAG_HEADER +#undef __THRUST_HOST_SYSTEM_TAG_HEADER + +namespace thrust +{ + +typedef thrust::system::__THRUST_HOST_SYSTEM_NAMESPACE::tag host_system_tag; + +} // end thrust + +// TODO remove this in 1.8.0 +namespace thrust +{ + +typedef THRUST_DEPRECATED host_system_tag host_space_tag; + +} // end thrust + diff --git a/compat/thrust/iterator/detail/is_iterator_category.h b/compat/thrust/iterator/detail/is_iterator_category.h new file mode 100644 index 0000000..95f14d5 --- /dev/null +++ b/compat/thrust/iterator/detail/is_iterator_category.h @@ -0,0 +1,60 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ + +namespace detail +{ + +template + struct is_host_iterator_category + : thrust::detail::or_< + thrust::detail::is_convertible, + thrust::detail::is_convertible + > +{ +}; // end is_host_iterator_category + +template + struct is_device_iterator_category + : thrust::detail::or_< + thrust::detail::is_convertible, + thrust::detail::is_convertible + > +{ +}; // end is_device_iterator_category + + +template + struct is_iterator_category + : thrust::detail::or_< + is_host_iterator_category, + is_device_iterator_category + > +{ +}; // end is_iterator_category + +} // end detail + +} // end thrust + diff --git a/compat/thrust/iterator/detail/is_trivial_iterator.h b/compat/thrust/iterator/detail/is_trivial_iterator.h new file mode 100644 index 0000000..ca37e74 --- /dev/null +++ b/compat/thrust/iterator/detail/is_trivial_iterator.h @@ -0,0 +1,96 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#if __GNUC__ +// forward declaration of gnu's __normal_iterator +namespace __gnu_cxx +{ + +template class __normal_iterator; + +} // end __gnu_cxx +#endif // __GNUC__ + +#if _MSC_VER +// forward declaration of MSVC's "normal iterators" +namespace std +{ + +template struct _Ranit; + +} // end std +#endif // _MSC_VER + +namespace thrust +{ +namespace detail +{ + +#ifdef __GNUC__ +template + struct is_gnu_normal_iterator + : false_type +{}; + + +// catch gnu __normal_iterators +template + struct is_gnu_normal_iterator< __gnu_cxx::__normal_iterator > + : true_type +{}; +#endif // __GNUC__ + + +#ifdef _MSC_VER +// catch msvc _Ranit +template + struct is_convertible_to_msvc_Ranit : + is_convertible< + Iterator, + std::_Ranit< + typename iterator_value::type, + typename iterator_difference::type, + typename iterator_pointer::type, + typename iterator_reference::type + > + > +{}; +#endif // _MSC_VER + + +template + struct is_trivial_iterator : + integral_constant< + bool, + is_pointer::value + | thrust::detail::is_thrust_pointer::value +#if __GNUC__ + | is_gnu_normal_iterator::value +#endif // __GNUC__ +#ifdef _MSC_VER + | is_convertible_to_msvc_Ranit::value +#endif // _MSC_VER + > +{}; + +} // end detail +} // end thrust + diff --git a/compat/thrust/iterator/detail/iterator_adaptor_base.h b/compat/thrust/iterator/detail/iterator_adaptor_base.h new file mode 100644 index 0000000..8b77f05 --- /dev/null +++ b/compat/thrust/iterator/detail/iterator_adaptor_base.h @@ -0,0 +1,111 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
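is_trivial_iterator above reports, at compile time, whether an iterator is really just a (possibly wrapped) raw pointer, which lets algorithms fall back to memcpy-style paths. A small sketch of what it reports; the result for standard-library container iterators depends on the standard library in use, so only the pointer and node-based cases are shown.

\code
#include <thrust/iterator/detail/is_trivial_iterator.h>
#include <list>

int main(void)
{
  // a raw pointer is always a trivial iterator
  bool a = thrust::detail::is_trivial_iterator<int*>::value;                      // true

  // a node-based iterator cannot be reduced to a pointer
  bool b = thrust::detail::is_trivial_iterator<std::list<int>::iterator>::value;  // false

  (void)a; (void)b;
  return 0;
}
\endcode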
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ + + +// forward declaration of iterator_adaptor for iterator_adaptor_base below +template +class iterator_adaptor; + + +namespace detail +{ + +// If T is use_default, return the result of invoking +// DefaultNullaryFn, otherwise return T. +// XXX rename to dflt_help +template +struct ia_dflt_help + : thrust::detail::eval_if< + thrust::detail::is_same::value + , DefaultNullaryFn + , thrust::detail::identity_ + > +{ +}; // end ia_dflt_help + + +// A metafunction which computes an iterator_adaptor's base class, +// a specialization of iterator_facade. +template + struct iterator_adaptor_base +{ + typedef typename ia_dflt_help< + Value, + iterator_value + >::type value; + + typedef typename ia_dflt_help< + System, + thrust::iterator_system + >::type system; + + typedef typename ia_dflt_help< + Traversal, + thrust::iterator_traversal + >::type traversal; + + typedef typename ia_dflt_help< + Reference, + thrust::detail::eval_if< + thrust::detail::is_same::value, + thrust::iterator_reference, + thrust::detail::add_reference + > + >::type reference; + + typedef typename ia_dflt_help< + Difference, + iterator_difference + >::type difference; + + typedef thrust::iterator_facade< + Derived, + value, + system, + traversal, + reference, + difference + > type; +}; // end iterator_adaptor_base + + +} // end detail +} // end thrust + diff --git a/compat/thrust/iterator/detail/iterator_category_to_system.h b/compat/thrust/iterator/detail/iterator_category_to_system.h new file mode 100644 index 0000000..17e7d78 --- /dev/null +++ b/compat/thrust/iterator/detail/iterator_category_to_system.h @@ -0,0 +1,95 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ + +// XXX WAR circular #inclusion with forward declarations +struct random_access_universal_iterator_tag; +struct input_universal_iterator_tag; +struct output_universal_iterator_tag; + +namespace detail +{ + +// forward declaration +template struct is_iterator_system; + +template struct device_iterator_category_to_backend_system; + +// XXX this should work entirely differently +// we should just specialize this metafunction for iterator_category_with_system_and_traversal +template + struct iterator_category_to_system + // convertible to any iterator? + : eval_if< + or_< + is_convertible, + is_convertible + >::value, + + detail::identity_, + + // convertible to host iterator? 
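The point of iterator_adaptor_base is that an adaptor only has to name itself and its Base; every use_default slot is filled in from the Base's own traits. A minimal sketch of an adaptor that relies entirely on those defaults (the same pattern normal_iterator, later in this patch, uses); the pointer_adaptor name is illustrative only, and <thrust/iterator/iterator_adaptor.h> is assumed to be available.

\code
#include <thrust/iterator/iterator_adaptor.h>

// wraps a raw pointer; value_type, system, traversal, reference and
// difference_type are all deduced from Pointer by iterator_adaptor_base
template<typename Pointer>
class pointer_adaptor
  : public thrust::iterator_adaptor<pointer_adaptor<Pointer>, Pointer>
{
  typedef thrust::iterator_adaptor<pointer_adaptor<Pointer>, Pointer> super_t;

  public:
    __host__ __device__
    pointer_adaptor(void) {}

    __host__ __device__
    pointer_adaptor(Pointer p) : super_t(p) {}
};

int main(void)
{
  int data[4] = {1, 2, 3, 4};
  pointer_adaptor<int*> it(data);

  int x = *(it + 2);   // behaves exactly like the wrapped pointer: x == 3
  (void)x;
  return 0;
}
\endcode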
+ eval_if< + or_< + is_convertible, + is_convertible + >::value, + + detail::identity_, + + // convertible to device iterator? + eval_if< + or_< + is_convertible, + is_convertible + >::value, + + detail::identity_, + + // unknown system + detail::identity_ + > // if device + > // if host + > // if any +{ +}; // end iterator_category_to_system + + +template + struct iterator_category_or_traversal_to_system + : eval_if< + is_iterator_system::value, + detail::identity_, + iterator_category_to_system + > +{ +}; // end iterator_category_or_traversal_to_system + +} // end detail +} // end thrust + diff --git a/compat/thrust/iterator/detail/iterator_category_to_traversal.h b/compat/thrust/iterator/detail/iterator_category_to_traversal.h new file mode 100644 index 0000000..04ef60c --- /dev/null +++ b/compat/thrust/iterator/detail/iterator_category_to_traversal.h @@ -0,0 +1,178 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace thrust +{ + +// XXX WAR circular #inclusion with these forward declarations +struct bidirectional_universal_iterator_tag; +struct forward_universal_iterator_tag; + +namespace detail +{ + +// forward declarations +template struct is_iterator_system; +template struct is_iterator_traversal; + +// make type_traits easy to access +using namespace thrust::detail; + +template + struct host_system_category_to_traversal + : eval_if< + is_convertible::value, + detail::identity_, + eval_if< + is_convertible::value, + detail::identity_, + eval_if< + is_convertible::value, + detail::identity_, + eval_if< + is_convertible::value, + detail::identity_, + eval_if< + is_convertible::value, + detail::identity_, + void + > + > + > + > + > +{ +}; // end host_system_category_to_traversal + + + +template + struct device_system_category_to_traversal + : eval_if< + is_convertible::value, + detail::identity_, + eval_if< + is_convertible::value, + detail::identity_, + eval_if< + is_convertible::value, + detail::identity_, + eval_if< + is_convertible::value, + detail::identity_, + eval_if< + is_convertible::value, + detail::identity_, + void + > + > + > + > + > +{ +}; // end device_system_category_to_traversal + + + +template + struct any_system_category_to_traversal + : eval_if< + is_convertible::value, + identity_, + eval_if< + is_convertible::value, + identity_, + eval_if< + is_convertible::value, + identity_, + eval_if< + is_convertible::value, + identity_, + eval_if< + is_convertible::value, + identity_, + + // unknown traversal + void + > + > + > + > + > +{ +}; // end any_system_category_to_traversal + + +template + struct category_to_traversal + // check for any system + : eval_if< + or_< + is_convertible, + is_convertible + >::value, + + any_system_category_to_traversal, + + // check for host system + eval_if< + or_< + is_convertible, + is_convertible + >::value, + + host_system_category_to_traversal, + + // check for device system + eval_if< + or_< + 
is_convertible, + is_convertible + >::value, + + device_system_category_to_traversal, + + // unknown category + void + > + > + > +{}; + + +template + struct iterator_category_to_traversal + : eval_if< + is_iterator_traversal::value, + detail::identity_, + category_to_traversal + > +{ +}; // end iterator_category_to_traversal + + +} // end detail + +} // end thrust + diff --git a/compat/thrust/iterator/detail/iterator_facade_category.h b/compat/thrust/iterator/detail/iterator_facade_category.h new file mode 100644 index 0000000..fbb8bd6 --- /dev/null +++ b/compat/thrust/iterator/detail/iterator_facade_category.h @@ -0,0 +1,283 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ + +namespace detail +{ + +template + struct iterator_category_with_system_and_traversal + : Category +{ +}; // end iterator_category_with_system_and_traversal + +// specialize iterator_category_to_system for iterator_category_with_system_and_traversal +template struct iterator_category_to_system; + +template + struct iterator_category_to_system > +{ + typedef System type; +}; // end iterator_category_with_system_and_traversal + + +// adapted from http://www.boost.org/doc/libs/1_37_0/libs/iterator/doc/iterator_facade.html#iterator-category +// +// in our implementation, R need not be a reference type to result in a category +// derived from forward_XXX_iterator_tag +// +// iterator-category(T,V,R) := +// if(T is convertible to input_host_iterator_tag +// || T is convertible to output_host_iterator_tag +// || T is convertible to input_device_iterator_tag +// || T is convertible to output_device_iterator_tag +// ) +// return T +// +// else if (T is not convertible to incrementable_traversal_tag) +// the program is ill-formed +// +// else return a type X satisfying the following two constraints: +// +// 1. X is convertible to X1, and not to any more-derived +// type, where X1 is defined by: +// +// if (T is convertible to forward_traversal_tag) +// { +// if (T is convertible to random_access_traversal_tag) +// X1 = random_access_host_iterator_tag +// else if (T is convertible to bidirectional_traversal_tag) +// X1 = bidirectional_host_iterator_tag +// else +// X1 = forward_host_iterator_tag +// } +// else +// { +// if (T is convertible to single_pass_traversal_tag +// && R is convertible to V) +// X1 = input_host_iterator_tag +// else +// X1 = T +// } +// +// 2. category-to-traversal(X) is convertible to the most +// derived traversal tag type to which X is also convertible, +// and not to any more-derived traversal tag type. + + +template + struct iterator_facade_default_category; + + +// Thrust's implementation of iterator_facade_default_category is slightly +// different from Boost's equivalent. 
+// Thrust does not check is_convertible because Reference +// may not be a complete type at this point, and implementations of is_convertible +// typically require that both types be complete. +// Instead, it simply assumes that if is_convertible, +// then the category is input_iterator_tag + + +// this is the function for standard system iterators +template + struct iterator_facade_default_category_std : + thrust::detail::eval_if< + thrust::detail::is_convertible::value, + thrust::detail::eval_if< + thrust::detail::is_convertible::value, + thrust::detail::identity_, + thrust::detail::eval_if< + thrust::detail::is_convertible::value, + thrust::detail::identity_, + thrust::detail::identity_ + > + >, + thrust::detail::eval_if< // XXX note we differ from Boost here + thrust::detail::is_convertible::value, + thrust::detail::identity_, + thrust::detail::identity_ + > + > +{ +}; // end iterator_facade_default_category_std + + +// this is the function for host system iterators +template + struct iterator_facade_default_category_host : + thrust::detail::eval_if< + thrust::detail::is_convertible::value, + thrust::detail::eval_if< + thrust::detail::is_convertible::value, + thrust::detail::identity_, + thrust::detail::eval_if< + thrust::detail::is_convertible::value, + thrust::detail::identity_, + thrust::detail::identity_ + > + >, + thrust::detail::eval_if< // XXX note we differ from Boost here + thrust::detail::is_convertible::value, + thrust::detail::identity_, + thrust::detail::identity_ + > + > +{ +}; // end iterator_facade_default_category_host + + +// this is the function for device system iterators +template + struct iterator_facade_default_category_device : + thrust::detail::eval_if< + thrust::detail::is_convertible::value, + thrust::detail::eval_if< + thrust::detail::is_convertible::value, + thrust::detail::identity_, + thrust::detail::eval_if< + thrust::detail::is_convertible::value, + thrust::detail::identity_, + thrust::detail::identity_ + > + >, + thrust::detail::eval_if< + thrust::detail::is_convertible::value, // XXX note we differ from Boost here + thrust::detail::identity_, + thrust::detail::identity_ + > + > +{ +}; // end iterator_facade_default_category_device + + +// this is the function for any system iterators +template + struct iterator_facade_default_category_any : + thrust::detail::eval_if< + + thrust::detail::is_convertible::value, + + thrust::detail::eval_if< + thrust::detail::is_convertible::value, + thrust::detail::identity_, + + thrust::detail::eval_if< + thrust::detail::is_convertible::value, + thrust::detail::identity_, + thrust::detail::identity_ + > + >, + + thrust::detail::eval_if< + thrust::detail::is_convertible::value, // XXX note we differ from Boost here + thrust::detail::identity_, + thrust::detail::identity_ + > + > +{ +}; // end iterator_facade_default_category_any + + +template + struct iterator_facade_default_category + // check for any system + : thrust::detail::eval_if< + thrust::detail::is_convertible::value, + iterator_facade_default_category_any, + + // check for host system + thrust::detail::eval_if< + thrust::detail::is_convertible::value, + iterator_facade_default_category_host, + + // check for device system + thrust::detail::eval_if< + thrust::detail::is_convertible::value, + iterator_facade_default_category_device, + + // if we don't recognize the system, get a standard iterator category + // and combine it with System & Traversal + thrust::detail::identity_< + thrust::detail::iterator_category_with_system_and_traversal< + typename 
iterator_facade_default_category_std::type, + System, + Traversal + > + > + > + > + > +{}; + + +template + struct iterator_facade_category_impl +{ + typedef typename iterator_facade_default_category< + System,Traversal,ValueParam,Reference + >::type category; + + // we must be able to deduce both Traversal & System from category + // otherwise, munge them all together + typedef typename thrust::detail::eval_if< + thrust::detail::and_< + thrust::detail::is_same< + Traversal, + typename thrust::detail::iterator_category_to_traversal::type + >, + thrust::detail::is_same< + System, + typename thrust::detail::iterator_category_to_system::type + > + >::value, + thrust::detail::identity_, + thrust::detail::identity_ > + >::type type; +}; // end iterator_facade_category_impl + + +template + struct iterator_facade_category +{ + typedef typename + thrust::detail::eval_if< + thrust::detail::is_iterator_category::value, + thrust::detail::identity_, // categories are fine as-is + iterator_facade_category_impl + >::type type; +}; // end iterator_facade_category + + +} // end detail +} // end thrust + diff --git a/compat/thrust/iterator/detail/iterator_traits.inl b/compat/thrust/iterator/detail/iterator_traits.inl new file mode 100644 index 0000000..924eabb --- /dev/null +++ b/compat/thrust/iterator/detail/iterator_traits.inl @@ -0,0 +1,112 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file iterator_traits.inl + * \brief Inline file for iterator_traits.h. 
+ */ + +#include +#include +#include + +namespace thrust +{ + +template + struct iterator_value +{ + typedef typename thrust::iterator_traits::value_type type; +}; // end iterator_value + + +template + struct iterator_pointer +{ + typedef typename thrust::iterator_traits::pointer type; +}; // end iterator_pointer + + +template + struct iterator_reference +{ + typedef typename iterator_traits::reference type; +}; // end iterator_reference + + +template + struct iterator_difference +{ + typedef typename thrust::iterator_traits::difference_type type; +}; // end iterator_difference + + +template + struct iterator_system + : detail::iterator_category_to_system< + typename thrust::iterator_traits::iterator_category + > +{ +}; // end iterator_system + +// specialize iterator_system for void *, which has no category +template<> + struct iterator_system +{ + typedef thrust::iterator_system::type type; +}; // end iterator_system + +template<> + struct iterator_system +{ + typedef thrust::iterator_system::type type; +}; // end iterator_system + + +template + struct iterator_traversal + : detail::iterator_category_to_traversal< + typename thrust::iterator_traits::iterator_category + > +{ +}; // end iterator_traversal + +namespace detail +{ + +template + struct is_iterator_traversal + : thrust::detail::is_convertible +{ +}; // end is_iterator_traversal + + +template + struct is_iterator_system + : detail::or_< + detail::is_convertible, + detail::or_< + detail::is_convertible, + detail::is_convertible + > + > +{ +}; // end is_iterator_system + + +} // end namespace detail +} // end namespace thrust + diff --git a/compat/thrust/iterator/detail/iterator_traversal_tags.h b/compat/thrust/iterator/detail/iterator_traversal_tags.h new file mode 100644 index 0000000..dcbebf3 --- /dev/null +++ b/compat/thrust/iterator/detail/iterator_traversal_tags.h @@ -0,0 +1,41 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace thrust +{ + +// define Boost's traversal tags +struct no_traversal_tag {}; + +struct incrementable_traversal_tag + : no_traversal_tag {}; + +struct single_pass_traversal_tag + : incrementable_traversal_tag {}; + +struct forward_traversal_tag + : single_pass_traversal_tag {}; + +struct bidirectional_traversal_tag + : forward_traversal_tag {}; + +struct random_access_traversal_tag + : bidirectional_traversal_tag {}; + +} // end thrust + diff --git a/compat/thrust/iterator/detail/minimum_category.h b/compat/thrust/iterator/detail/minimum_category.h new file mode 100644 index 0000000..e07e096 --- /dev/null +++ b/compat/thrust/iterator/detail/minimum_category.h @@ -0,0 +1,52 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
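The metafunctions above are the public way to interrogate an iterator: iterator_value and iterator_difference forward to iterator_traits, while iterator_system and iterator_traversal recover the backend and traversal from the iterator category. A small sketch, assuming <thrust/iterator/iterator_traits.h> and <thrust/device_vector.h>.

\code
#include <thrust/iterator/iterator_traits.h>
#include <thrust/device_vector.h>
#include <thrust/detail/type_traits.h>

int main(void)
{
  typedef thrust::device_vector<float>::iterator DevIter;

  // value and difference types come straight from iterator_traits
  typedef thrust::iterator_value<DevIter>::type      value;       // float
  typedef thrust::iterator_difference<DevIter>::type difference;  // std::ptrdiff_t

  // system and traversal are recovered from the iterator category
  bool on_device =
    thrust::detail::is_convertible<thrust::iterator_system<DevIter>::type,
                                   thrust::device_system_tag>::value;            // true

  bool random_access =
    thrust::detail::is_convertible<thrust::iterator_traversal<DevIter>::type,
                                   thrust::random_access_traversal_tag>::value;  // true

  (void)on_device; (void)random_access;
  return 0;
}
\endcode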
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace thrust +{ + +namespace detail +{ + +template + struct minimum_category + : minimum_type +{ +}; // end minimum_category + +} // end detail + +} // end thrust + + diff --git a/compat/thrust/iterator/detail/minimum_system.h b/compat/thrust/iterator/detail/minimum_system.h new file mode 100644 index 0000000..5448a0d --- /dev/null +++ b/compat/thrust/iterator/detail/minimum_system.h @@ -0,0 +1,49 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace thrust +{ +namespace detail +{ + +template + struct minimum_system + : minimum_type +{ +}; // end minimum_system + +} // end detail +} // end thrust + diff --git a/compat/thrust/iterator/detail/normal_iterator.h b/compat/thrust/iterator/detail/normal_iterator.h new file mode 100644 index 0000000..7fe61bf --- /dev/null +++ b/compat/thrust/iterator/detail/normal_iterator.h @@ -0,0 +1,76 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file normal_iterator.h + * \brief Defines the interface to an iterator class + * which adapts a pointer type. 
+ */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace detail +{ + + +template + class normal_iterator + : public iterator_adaptor< + normal_iterator, + Pointer + > +{ + typedef iterator_adaptor, Pointer> super_t; + + public: + __host__ __device__ + normal_iterator() {} + + __host__ __device__ + normal_iterator(Pointer p) + : super_t(p) {} + + template + __host__ __device__ + normal_iterator(const normal_iterator &other, + typename thrust::detail::enable_if_convertible< + OtherPointer, + Pointer + >::type * = 0) + : super_t(other.base()) {} + +}; // end normal_iterator + + +template + inline __host__ __device__ normal_iterator make_normal_iterator(Pointer ptr) +{ + return normal_iterator(ptr); +} + + +template struct is_trivial_iterator< normal_iterator > : public true_type {}; + + +} // end detail +} // end thrust + diff --git a/compat/thrust/iterator/detail/permutation_iterator_base.h b/compat/thrust/iterator/detail/permutation_iterator_base.h new file mode 100644 index 0000000..a145b88 --- /dev/null +++ b/compat/thrust/iterator/detail/permutation_iterator_base.h @@ -0,0 +1,53 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ + +template class permutation_iterator; + + +namespace detail +{ + +template + struct permutation_iterator_base +{ + typedef typename thrust::iterator_system::type System1; + typedef typename thrust::iterator_system::type System2; + + typedef thrust::iterator_adaptor< + permutation_iterator, + IndexIterator, + typename thrust::iterator_value::type, + typename detail::minimum_system::type, + thrust::use_default, + typename thrust::iterator_reference::type + > type; +}; // end permutation_iterator_base + +} // end detail + +} // end thrust + diff --git a/compat/thrust/iterator/detail/retag.h b/compat/thrust/iterator/detail/retag.h new file mode 100644 index 0000000..4417fa5 --- /dev/null +++ b/compat/thrust/iterator/detail/retag.h @@ -0,0 +1,140 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
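permutation_iterator_base above stitches an element iterator and an index iterator together, taking the minimum_system of the two so that, for example, a device value range indexed by a device index range stays a device iterator. A usage sketch, assuming the <thrust/iterator/permutation_iterator.h>, <thrust/device_vector.h> and <thrust/reduce.h> headers from this library.

\code
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/device_vector.h>
#include <thrust/reduce.h>

int main(void)
{
  thrust::device_vector<float> values(8, 10.0f);
  values[3] = 1.0f;
  values[7] = 2.0f;

  thrust::device_vector<int> indices(2);
  indices[0] = 3;
  indices[1] = 7;

  // view values[3] and values[7] as a contiguous two-element range
  float sum = thrust::reduce(
      thrust::make_permutation_iterator(values.begin(), indices.begin()),
      thrust::make_permutation_iterator(values.begin(), indices.end()));

  // sum == 3.0f
  (void)sum;
  return 0;
}
\endcode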
+ */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ +namespace detail +{ + + +// we can retag an iterator if FromTag converts to ToTag +// or vice versa +template + struct is_retaggable + : integral_constant< + bool, + (is_convertible::value || is_convertible::value) + > +{}; + + +template + struct enable_if_retaggable + : enable_if< + is_retaggable::value, + Result + > +{}; // end enable_if_retaggable + + +} // end detail + + +template + thrust::detail::tagged_iterator + reinterpret_tag(Iterator iter) +{ + return thrust::detail::tagged_iterator(iter); +} // end reinterpret_tag() + + +// specialization for raw pointer +template + thrust::pointer + reinterpret_tag(T *ptr) +{ + return thrust::pointer(ptr); +} // end reinterpret_tag() + + +// specialization for thrust::pointer +template + thrust::pointer + reinterpret_tag(thrust::pointer ptr) +{ + return reinterpret_tag(ptr.get()); +} // end reinterpret_tag() + + +// avoid deeply-nested tagged_iterator +template + thrust::detail::tagged_iterator + reinterpret_tag(thrust::detail::tagged_iterator iter) +{ + return reinterpret_tag(iter.base()); +} // end reinterpret_tag() + + +template + typename thrust::detail::enable_if_retaggable< + typename thrust::iterator_system::type, + Tag, + thrust::detail::tagged_iterator + >::type + retag(Iterator iter) +{ + return reinterpret_tag(iter); +} // end retag() + + +// specialization for raw pointer +template + typename thrust::detail::enable_if_retaggable< + typename thrust::iterator_system::type, + Tag, + thrust::pointer + >::type + retag(T *ptr) +{ + return reinterpret_tag(ptr); +} // end retag() + + +// specialization for thrust::pointer +template + typename thrust::detail::enable_if_retaggable< + OtherTag, + Tag, + thrust::pointer + >::type + retag(thrust::pointer ptr) +{ + return reinterpret_tag(ptr); +} // end retag() + + +// avoid deeply-nested tagged_iterator +template + typename thrust::detail::enable_if_retaggable< + OtherTag, + Tag, + thrust::detail::tagged_iterator + >::type + retag(thrust::detail::tagged_iterator iter) +{ + return reinterpret_tag(iter); +} // end retag() + + +} // end thrust + diff --git a/compat/thrust/iterator/detail/reverse_iterator.inl b/compat/thrust/iterator/detail/reverse_iterator.inl new file mode 100644 index 0000000..03e9032 --- /dev/null +++ b/compat/thrust/iterator/detail/reverse_iterator.inl @@ -0,0 +1,108 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
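[editor's note] The reinterpret_tag / retag overloads above let a raw pointer or an already-tagged iterator be re-associated with a backend tag. A hedged usage sketch: wrapping a plain host array so that algorithm dispatch is explicit. This assumes thrust::host_system_tag is reachable through the includes shown, which holds for recent Thrust releases. retag differs only in that enable_if_retaggable statically checks that the old and new tags are interconvertible.

#include <thrust/iterator/retag.h>
#include <thrust/fill.h>

int main(void)
{
  int data[4] = {0, 0, 0, 0};

  // reinterpret_tag wraps the raw pointer in thrust::pointer<int, host_system_tag>,
  // so thrust::fill dispatches to the host backend explicitly
  thrust::fill(thrust::reinterpret_tag<thrust::host_system_tag>(data),
               thrust::reinterpret_tag<thrust::host_system_tag>(data + 4),
               7);

  return data[0] == 7 ? 0 : 1;
}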
+ */ + +#include +#include + +namespace thrust +{ + +namespace detail +{ + +__thrust_hd_warning_disable__ +template +__host__ __device__ + Iterator prior(Iterator x) +{ + return --x; +} // end prior() + +} // end detail + +template + reverse_iterator + ::reverse_iterator(BidirectionalIterator x) + :super_t(x) +{ +} // end reverse_iterator::reverse_iterator() + +template + template + reverse_iterator + ::reverse_iterator(reverse_iterator const &r +// XXX msvc screws this up +#ifndef _MSC_VER + , typename thrust::detail::enable_if< + thrust::detail::is_convertible< + OtherBidirectionalIterator, + BidirectionalIterator + >::value + >::type * +#endif // _MSC_VER + ) + :super_t(r.base()) +{ +} // end reverse_iterator::reverse_iterator() + +template + typename reverse_iterator::super_t::reference + reverse_iterator + ::dereference(void) const +{ + return *thrust::detail::prior(this->base()); +} // end reverse_iterator::increment() + +template + void reverse_iterator + ::increment(void) +{ + --this->base_reference(); +} // end reverse_iterator::increment() + +template + void reverse_iterator + ::decrement(void) +{ + ++this->base_reference(); +} // end reverse_iterator::decrement() + +template + void reverse_iterator + ::advance(typename super_t::difference_type n) +{ + this->base_reference() += -n; +} // end reverse_iterator::advance() + +template + template + typename reverse_iterator::super_t::difference_type + reverse_iterator + ::distance_to(reverse_iterator const &y) const +{ + return this->base_reference() - y.base(); +} // end reverse_iterator::distance_to() + +template +__host__ __device__ +reverse_iterator make_reverse_iterator(BidirectionalIterator x) +{ + return reverse_iterator(x); +} // end make_reverse_iterator() + + +} // end thrust + diff --git a/compat/thrust/iterator/detail/reverse_iterator_base.h b/compat/thrust/iterator/detail/reverse_iterator_base.h new file mode 100644 index 0000000..c10c5b7 --- /dev/null +++ b/compat/thrust/iterator/detail/reverse_iterator_base.h @@ -0,0 +1,42 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace thrust +{ + +template class reverse_iterator; + +namespace detail +{ + +template + struct reverse_iterator_base +{ + typedef thrust::iterator_adaptor< + thrust::reverse_iterator, + BidirectionalIterator + > type; +}; // end reverse_iterator_base + +} // end detail + +} // end thrust + diff --git a/compat/thrust/iterator/detail/tagged_iterator.h b/compat/thrust/iterator/detail/tagged_iterator.h new file mode 100644 index 0000000..69e6445 --- /dev/null +++ b/compat/thrust/iterator/detail/tagged_iterator.h @@ -0,0 +1,74 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
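[editor's note] The reverse_iterator primitives above (dereference via prior, swapped increment/decrement) provide the usual reversed view, with make_reverse_iterator as the public entry point. A small usage sketch:

#include <thrust/iterator/reverse_iterator.h>
#include <thrust/device_vector.h>
#include <thrust/copy.h>

int main(void)
{
  thrust::device_vector<int> v(4);
  v[0] = 1; v[1] = 2; v[2] = 3; v[3] = 4;

  thrust::device_vector<int> r(4);

  // note the usual off-by-one convention: the reverse of [begin, end) starts at end
  thrust::copy(thrust::make_reverse_iterator(v.end()),
               thrust::make_reverse_iterator(v.begin()),
               r.begin());

  // r is now {4, 3, 2, 1}
  return 0;
}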
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ +namespace detail +{ + +template class tagged_iterator; + +template + struct tagged_iterator_base +{ + typedef thrust::iterator_adaptor< + tagged_iterator, + Iterator, + typename thrust::iterator_value::type, + Tag, + typename thrust::iterator_traversal::type, + typename thrust::iterator_reference::type, + typename thrust::iterator_difference::type + > type; +}; // end tagged_iterator_base + +template + class tagged_iterator + : public tagged_iterator_base::type +{ + private: + typedef typename tagged_iterator_base::type super_t; + + public: + __host__ __device__ + tagged_iterator(void) {} + + __host__ __device__ + explicit tagged_iterator(Iterator x) + : super_t(x) {} +}; // end tagged_iterator + + +// specialize is_trivial_iterator for tagged_iterator +template struct is_trivial_iterator; + +// tagged_iterator is trivial if its base iterator is +template + struct is_trivial_iterator > + : is_trivial_iterator +{}; + + +} // end detail +} // end thrust + diff --git a/compat/thrust/iterator/detail/transform_iterator.inl b/compat/thrust/iterator/detail/transform_iterator.inl new file mode 100644 index 0000000..a5a36a7 --- /dev/null +++ b/compat/thrust/iterator/detail/transform_iterator.inl @@ -0,0 +1,72 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +namespace thrust +{ + +template + class transform_iterator; + +namespace detail +{ + +// Compute the iterator_adaptor instantiation to be used for transform_iterator +template +struct transform_iterator_base +{ + private: + // By default, dereferencing the iterator yields the same as the function. + typedef typename thrust::detail::ia_dflt_help< + Reference, + thrust::detail::result_of::type)> + >::type reference; + + // To get the default for Value: remove any reference on the + // result type, but retain any constness to signal + // non-writability. Note that if we adopt Thomas' suggestion + // to key non-writability *only* on the Reference argument, + // we'd need to strip constness here as well. 
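[editor's note] The transform_iterator_base computation above (its value_type typedef follows just below) derives the iterator's reference type from result_of of the adapted function, which is what lets algorithms consume a transform_iterator lazily. A usage sketch with a local square functor; the functor and values are the example's own:

#include <thrust/iterator/transform_iterator.h>
#include <thrust/device_vector.h>
#include <thrust/reduce.h>

struct square_f
{
  __host__ __device__
  float operator()(float x) const { return x * x; }
};

int main(void)
{
  thrust::device_vector<float> v(4);
  v[0] = -1.0f; v[1] = 2.0f; v[2] = -3.0f; v[3] = 4.0f;

  // sum of squares, computed element-by-element as reduce pulls values through
  // the transform_iterator; no temporary vector of squares is allocated
  float ss = thrust::reduce(
      thrust::make_transform_iterator(v.begin(), square_f()),
      thrust::make_transform_iterator(v.end(),   square_f()));

  return ss == 30.0f ? 0 : 1;
}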
+ typedef typename thrust::detail::ia_dflt_help< + Value, + thrust::detail::remove_reference + >::type cv_value_type; + + public: + typedef thrust::iterator_adaptor + < + transform_iterator + , Iterator + , cv_value_type + , thrust::use_default // Leave the system alone + //, thrust::use_default // Leave the traversal alone + // use the Iterator's category to let any system iterators remain random access even though + // transform_iterator's reference type may not be a reference + // XXX figure out why only iterators whose reference types are true references are random access + , typename thrust::iterator_traits::iterator_category + , reference + > type; +}; + + +} // end detail +} // end thrust + diff --git a/compat/thrust/iterator/detail/tuple_of_iterator_references.h b/compat/thrust/iterator/detail/tuple_of_iterator_references.h new file mode 100644 index 0000000..fdbf6b8 --- /dev/null +++ b/compat/thrust/iterator/detail/tuple_of_iterator_references.h @@ -0,0 +1,246 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ +namespace detail +{ + + +template< + typename T0, typename T1, typename T2, + typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, + typename T9 +> + class tuple_of_iterator_references + : public thrust::tuple +{ + private: + typedef thrust::tuple super_t; + + public: + // allow implicit construction from tuple + inline __host__ __device__ + tuple_of_iterator_references(const super_t &other) + : super_t(other) + {} + + // allow assignment from tuples + // XXX might be worthwhile to guard this with an enable_if is_assignable + template + inline __host__ __device__ + tuple_of_iterator_references &operator=(const detail::cons &other) + { + super_t::operator=(other); + return *this; + } + + // allow assignment from pairs + // XXX might be worthwhile to guard this with an enable_if is_assignable + template + inline __host__ __device__ + tuple_of_iterator_references &operator=(const thrust::pair &other) + { + super_t::operator=(other); + return *this; + } + + // allow assignment from reference + // XXX perhaps we should generalize to reference + // we could captures reference this way + template + inline __host__ __device__ +// XXX gcc-4.2 crashes on is_assignable +// typename thrust::detail::enable_if< +// thrust::detail::is_assignable< +// super_t, +// const thrust::tuple +// >::value, +// tuple_of_iterator_references & +// >::type + tuple_of_iterator_references & + operator=(const thrust::reference, Pointer, Derived> &other) + { + typedef thrust::tuple tuple_type; + + // XXX perhaps this could be accelerated + tuple_type other_tuple = other; + super_t::operator=(other_tuple); + return *this; + } + + + // duplicate thrust::tuple's constructors + inline __host__ __device__ + tuple_of_iterator_references() {} + + inline __host__ __device__ + tuple_of_iterator_references(typename access_traits::parameter_type t0) + 
: super_t(t0, + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type())) + {} + + inline __host__ __device__ + tuple_of_iterator_references(typename access_traits::parameter_type t0, + typename access_traits::parameter_type t1) + : super_t(t0, t1, + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type())) + {} + + inline __host__ __device__ + tuple_of_iterator_references(typename access_traits::parameter_type t0, + typename access_traits::parameter_type t1, + typename access_traits::parameter_type t2) + : super_t(t0, t1, t2, + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type())) + {} + + inline __host__ __device__ + tuple_of_iterator_references(typename access_traits::parameter_type t0, + typename access_traits::parameter_type t1, + typename access_traits::parameter_type t2, + typename access_traits::parameter_type t3) + : super_t(t0, t1, t2, t3, + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type())) + {} + + inline __host__ __device__ + tuple_of_iterator_references(typename access_traits::parameter_type t0, + typename access_traits::parameter_type t1, + typename access_traits::parameter_type t2, + typename access_traits::parameter_type t3, + typename access_traits::parameter_type t4) + : super_t(t0, t1, t2, t3, t4, + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type())) + {} + + inline __host__ __device__ + tuple_of_iterator_references(typename access_traits::parameter_type t0, + typename access_traits::parameter_type t1, + typename access_traits::parameter_type t2, + typename access_traits::parameter_type t3, + typename access_traits::parameter_type t4, + typename access_traits::parameter_type t5) + : super_t(t0, t1, t2, t3, t4, t5, + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type())) + {} + + inline __host__ __device__ + tuple_of_iterator_references(typename access_traits::parameter_type t0, + typename access_traits::parameter_type t1, + typename access_traits::parameter_type t2, + typename access_traits::parameter_type t3, + typename access_traits::parameter_type t4, + typename access_traits::parameter_type t5, + typename access_traits::parameter_type t6) + : super_t(t0, t1, t2, t3, t4, t5, t6, + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type())) + {} + + inline __host__ __device__ + tuple_of_iterator_references(typename access_traits::parameter_type t0, + typename access_traits::parameter_type t1, + typename access_traits::parameter_type t2, + typename access_traits::parameter_type t3, + typename access_traits::parameter_type t4, + typename access_traits::parameter_type t5, + typename access_traits::parameter_type t6, + typename access_traits::parameter_type t7) + : super_t(t0, t1, t2, t3, t4, t5, t6, t7, + static_cast(null_type()), + static_cast(null_type())) + {} + + inline __host__ __device__ + 
tuple_of_iterator_references(typename access_traits::parameter_type t0, + typename access_traits::parameter_type t1, + typename access_traits::parameter_type t2, + typename access_traits::parameter_type t3, + typename access_traits::parameter_type t4, + typename access_traits::parameter_type t5, + typename access_traits::parameter_type t6, + typename access_traits::parameter_type t7, + typename access_traits::parameter_type t8) + : super_t(t0, t1, t2, t3, t4, t5, t6, t7, t8, + static_cast(null_type())) + {} + + inline __host__ __device__ + tuple_of_iterator_references(typename access_traits::parameter_type t0, + typename access_traits::parameter_type t1, + typename access_traits::parameter_type t2, + typename access_traits::parameter_type t3, + typename access_traits::parameter_type t4, + typename access_traits::parameter_type t5, + typename access_traits::parameter_type t6, + typename access_traits::parameter_type t7, + typename access_traits::parameter_type t8, + typename access_traits::parameter_type t9) + : super_t(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9) + {} +}; + + +} // end detail +} // end thrust + diff --git a/compat/thrust/iterator/detail/universal_categories.h b/compat/thrust/iterator/detail/universal_categories.h new file mode 100644 index 0000000..7c39222 --- /dev/null +++ b/compat/thrust/iterator/detail/universal_categories.h @@ -0,0 +1,85 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +namespace thrust +{ + +// define these types without inheritance to avoid ambiguous conversion to base classes + +struct input_universal_iterator_tag +{ + operator input_host_iterator_tag () {return input_host_iterator_tag();} + + operator input_device_iterator_tag () {return input_device_iterator_tag();} +}; + +struct output_universal_iterator_tag +{ + operator output_host_iterator_tag () {return output_host_iterator_tag();} + + operator output_device_iterator_tag () {return output_device_iterator_tag();} +}; + +struct forward_universal_iterator_tag + : input_universal_iterator_tag +{ + operator forward_host_iterator_tag () {return forward_host_iterator_tag();}; + + operator forward_device_iterator_tag () {return forward_device_iterator_tag();}; +}; + +struct bidirectional_universal_iterator_tag + : forward_universal_iterator_tag +{ + operator bidirectional_host_iterator_tag () {return bidirectional_host_iterator_tag();}; + + operator bidirectional_device_iterator_tag () {return bidirectional_device_iterator_tag();}; +}; + + +namespace detail +{ + +// create this struct to control conversion precedence in random_access_universal_iterator_tag +template +struct one_degree_of_separation + : T +{ +}; + +} // end detail + + +struct random_access_universal_iterator_tag +{ + // these conversions are all P0 + operator random_access_host_iterator_tag () {return random_access_host_iterator_tag();}; + + operator random_access_device_iterator_tag () {return random_access_device_iterator_tag();}; + + // bidirectional_universal_iterator_tag is P1 + operator detail::one_degree_of_separation () {return detail::one_degree_of_separation();} + +}; + + +} // end thrust + diff --git a/compat/thrust/iterator/detail/zip_iterator.inl b/compat/thrust/iterator/detail/zip_iterator.inl new file mode 100644 index 0000000..fddd0ad --- /dev/null +++ b/compat/thrust/iterator/detail/zip_iterator.inl @@ -0,0 +1,151 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +namespace thrust +{ + + +template + zip_iterator + ::zip_iterator(void) +{ +} // end zip_iterator::zip_iterator() + + +template + zip_iterator + ::zip_iterator(IteratorTuple iterator_tuple) + :m_iterator_tuple(iterator_tuple) +{ +} // end zip_iterator::zip_iterator() + + +template + template + zip_iterator + ::zip_iterator(const zip_iterator &other, + typename thrust::detail::enable_if_convertible< + OtherIteratorTuple, + IteratorTuple + >::type *) + :m_iterator_tuple(other.get_iterator_tuple()) +{ +} // end zip_iterator::zip_iterator() + + +template +const IteratorTuple &zip_iterator + ::get_iterator_tuple(void) const +{ + return m_iterator_tuple; +} // end zip_iterator::get_iterator_tuple() + + +template + typename zip_iterator::super_t::reference + zip_iterator + ::dereference(void) const +{ + using namespace detail::tuple_impl_specific; + + return thrust::detail::tuple_host_device_transform(get_iterator_tuple(), detail::dereference_iterator()); +} // end zip_iterator::dereference() + + +__thrust_hd_warning_disable__ +template + template + bool zip_iterator + ::equal(const zip_iterator &other) const +{ + return get<0>(get_iterator_tuple()) == get<0>(other.get_iterator_tuple()); +} // end zip_iterator::equal() + + +template + void zip_iterator + ::advance(typename super_t::difference_type n) +{ + using namespace detail::tuple_impl_specific; + + // XXX note that we use a pointer to System to dispatch to avoid + // default construction of a System + typename thrust::iterator_system::type *use_me_to_dispatch = 0; + + // dispatch on system + tuple_for_each(m_iterator_tuple, + detail::advance_iterator(n), + use_me_to_dispatch); +} // end zip_iterator::advance() + + +template + void zip_iterator + ::increment(void) +{ + using namespace detail::tuple_impl_specific; + + // XXX note that we use a pointer to System to dispatch to avoid + // default construction of a System + typename thrust::iterator_system::type *use_me_to_dispatch = 0; + + // dispatch on system + tuple_for_each(m_iterator_tuple, detail::increment_iterator(), + use_me_to_dispatch); +} // end zip_iterator::increment() + + +template + void zip_iterator + ::decrement(void) +{ + using namespace detail::tuple_impl_specific; + + // XXX note that we use a pointer to System to dispatch to avoid + // default construction of a System + typename thrust::iterator_system::type *use_me_to_dispatch = 0; + + // dispatch on system + tuple_for_each(m_iterator_tuple, detail::decrement_iterator(), + use_me_to_dispatch); +} // end zip_iterator::decrement() + + +__thrust_hd_warning_disable__ +template + template + typename zip_iterator::super_t::difference_type + zip_iterator + ::distance_to(const zip_iterator &other) const +{ + return get<0>(other.get_iterator_tuple()) - get<0>(get_iterator_tuple()); +} // end zip_iterator::distance_to() + + +template + zip_iterator make_zip_iterator(IteratorTuple t) +{ + return zip_iterator(t); +} // end make_zip_iterator() + + +} // end thrust + diff --git a/compat/thrust/iterator/detail/zip_iterator_base.h b/compat/thrust/iterator/detail/zip_iterator_base.h new file mode 100644 index 0000000..9dd7789 --- /dev/null +++ b/compat/thrust/iterator/detail/zip_iterator_base.h @@ -0,0 +1,418 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
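[editor's note] zip_iterator's dereference above returns a tuple of the member iterators' references (the tuple_of_iterator_references defined earlier), so writing through thrust::get<N> updates the underlying sequence. The classic use is fusing several ranges into a single pass, e.g. an in-place SAXPY; the functor name is the example's own:

#include <thrust/iterator/zip_iterator.h>
#include <thrust/device_vector.h>
#include <thrust/for_each.h>
#include <thrust/tuple.h>

struct saxpy_f
{
  float a;
  saxpy_f(float a) : a(a) {}

  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t) const
  {
    // get<0> is x, get<1> is y; the assignment writes through to y
    thrust::get<1>(t) = a * thrust::get<0>(t) + thrust::get<1>(t);
  }
};

int main(void)
{
  thrust::device_vector<float> x(4, 1.0f), y(4, 2.0f);

  thrust::for_each(
      thrust::make_zip_iterator(thrust::make_tuple(x.begin(), y.begin())),
      thrust::make_zip_iterator(thrust::make_tuple(x.end(),   y.end())),
      saxpy_f(3.0f));

  // y is now {5, 5, 5, 5}
  return 0;
}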
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ + +// forward declare zip_iterator for zip_iterator_base +template class zip_iterator; + +namespace detail +{ + + +// Functors to be used with tuple algorithms +// +template +class advance_iterator +{ +public: + inline __host__ __device__ + advance_iterator(DiffType step) : m_step(step) {} + + template + inline __host__ __device__ + void operator()(Iterator& it) const + { it += m_step; } + +private: + DiffType m_step; +}; // end advance_iterator + + +struct increment_iterator +{ + template + inline __host__ __device__ + void operator()(Iterator& it) + { ++it; } +}; // end increment_iterator + + +struct decrement_iterator +{ + template + inline __host__ __device__ + void operator()(Iterator& it) + { --it; } +}; // end decrement_iterator + + +struct dereference_iterator +{ + template + struct apply + { + typedef typename + iterator_traits::reference + type; + }; // end apply + + // XXX silence warnings of the form "calling a __host__ function from a __host__ __device__ function is not allowed + __thrust_hd_warning_disable__ + template + __host__ __device__ + typename apply::type operator()(Iterator const& it) + { + return *it; + } +}; // end dereference_iterator + + +// The namespace tuple_impl_specific provides two meta- +// algorithms and two algorithms for tuples. +namespace tuple_impl_specific +{ + +// define apply1 for tuple_meta_transform_impl +template + struct apply1 + : UnaryMetaFunctionClass::template apply +{ +}; // end apply1 + + +// define apply2 for tuple_meta_accumulate_impl +template + struct apply2 + : UnaryMetaFunctionClass::template apply +{ +}; // end apply2 + + +// Meta-accumulate algorithm for tuples. Note: The template +// parameter StartType corresponds to the initial value in +// ordinary accumulation. +// +template + struct tuple_meta_accumulate; + +template< + typename Tuple + , class BinaryMetaFun + , typename StartType +> + struct tuple_meta_accumulate_impl +{ + typedef typename apply2< + BinaryMetaFun + , typename Tuple::head_type + , typename tuple_meta_accumulate< + typename Tuple::tail_type + , BinaryMetaFun + , StartType + >::type + >::type type; +}; + + +template< + typename Tuple + , class BinaryMetaFun + , typename StartType +> +struct tuple_meta_accumulate + : thrust::detail::eval_if< + thrust::detail::is_same::value + , thrust::detail::identity_ + , tuple_meta_accumulate_impl< + Tuple + , BinaryMetaFun + , StartType + > + > // end eval_if +{ +}; // end tuple_meta_accumulate + + +// transform algorithm for tuples. The template parameter Fun +// must be a unary functor which is also a unary metafunction +// class that computes its return type based on its argument +// type. For example: +// +// struct to_ptr +// { +// template +// struct apply +// { +// typedef Arg* type; +// } +// +// template +// Arg* operator()(Arg x); +// }; + + + +// for_each algorithm for tuples. 
+// +template +inline __host__ __device__ +Fun tuple_for_each(thrust::null_type, Fun f, System *) +{ + return f; +} // end tuple_for_each() + + +template +inline __host__ __device__ +Fun tuple_for_each(Tuple& t, Fun f, System *dispatch_tag) +{ + f( t.get_head() ); + return tuple_for_each(t.get_tail(), f, dispatch_tag); +} // end tuple_for_each() + + +template +inline __host__ __device__ +Fun tuple_for_each(Tuple& t, Fun f, thrust::host_system_tag *dispatch_tag) +{ +// XXX this path is required in order to accomodate pure host iterators +// (such as std::vector::iterator) in a zip_iterator +#ifndef __CUDA_ARCH__ + f( t.get_head() ); + return tuple_for_each(t.get_tail(), f, dispatch_tag); +#else + // this code will never be called + return f; +#endif +} // end tuple_for_each() + + +// Equality of tuples. NOTE: "==" for tuples currently (7/2003) +// has problems under some compilers, so I just do my own. +// No point in bringing in a bunch of #ifdefs here. This is +// going to go away with the next tuple implementation anyway. +// +__host__ __device__ +inline bool tuple_equal(thrust::null_type, thrust::null_type) +{ return true; } + + +template +__host__ __device__ +bool tuple_equal(Tuple1 const& t1, Tuple2 const& t2) +{ + return t1.get_head() == t2.get_head() && + tuple_equal(t1.get_tail(), t2.get_tail()); +} // end tuple_equal() + +} // end end tuple_impl_specific + + +// Metafunction to obtain the type of the tuple whose element types +// are the value_types of an iterator tupel. +// +template + struct tuple_of_value_types + : tuple_meta_transform< + IteratorTuple, + iterator_value + > +{ +}; // end tuple_of_value_types + + +struct minimum_category_lambda +{ + template + struct apply : minimum_category + {}; +}; + + + +// Metafunction to obtain the minimal traversal tag in a tuple +// of iterators. +// +template +struct minimum_traversal_category_in_iterator_tuple +{ + typedef typename tuple_meta_transform< + IteratorTuple + , thrust::iterator_traversal + >::type tuple_of_traversal_tags; + + typedef typename tuple_impl_specific::tuple_meta_accumulate< + tuple_of_traversal_tags + , minimum_category_lambda + , thrust::random_access_traversal_tag + >::type type; +}; + + +struct minimum_system_lambda +{ + template + struct apply : minimum_system + {}; +}; + + + +// Metafunction to obtain the minimal system tag in a tuple +// of iterators. 
+template +struct minimum_system_in_iterator_tuple +{ + typedef typename thrust::detail::tuple_meta_transform< + IteratorTuple, + thrust::iterator_system + >::type tuple_of_system_tags; + + typedef typename tuple_impl_specific::tuple_meta_accumulate< + tuple_of_system_tags, + minimum_system_lambda, + thrust::any_system_tag + >::type type; +}; + +namespace zip_iterator_base_ns +{ + + +template + struct tuple_elements_helper + : eval_if< + (i < tuple_size::value), + tuple_element, + identity_ + > +{}; + + +template + struct tuple_elements +{ + typedef typename tuple_elements_helper<0,Tuple>::type T0; + typedef typename tuple_elements_helper<1,Tuple>::type T1; + typedef typename tuple_elements_helper<2,Tuple>::type T2; + typedef typename tuple_elements_helper<3,Tuple>::type T3; + typedef typename tuple_elements_helper<4,Tuple>::type T4; + typedef typename tuple_elements_helper<5,Tuple>::type T5; + typedef typename tuple_elements_helper<6,Tuple>::type T6; + typedef typename tuple_elements_helper<7,Tuple>::type T7; + typedef typename tuple_elements_helper<8,Tuple>::type T8; + typedef typename tuple_elements_helper<9,Tuple>::type T9; +}; + + +template + struct tuple_of_iterator_references +{ + // get a thrust::tuple of the iterators' references + typedef typename tuple_meta_transform< + IteratorTuple, + iterator_reference + >::type tuple_of_references; + + // get at the individual tuple element types by name + typedef tuple_elements elements; + + // map thrust::tuple to tuple_of_iterator_references + typedef thrust::detail::tuple_of_iterator_references< + typename elements::T0, + typename elements::T1, + typename elements::T2, + typename elements::T3, + typename elements::T4, + typename elements::T5, + typename elements::T6, + typename elements::T7, + typename elements::T8, + typename elements::T9 + > type; +}; + + +} // end zip_iterator_base_ns + +/////////////////////////////////////////////////////////////////// +// +// Class zip_iterator_base +// +// Builds and exposes the iterator facade type from which the zip +// iterator will be derived. +// +template + struct zip_iterator_base +{ + //private: + // reference type is the type of the tuple obtained from the + // iterators' reference types. + typedef typename zip_iterator_base_ns::tuple_of_iterator_references::type reference; + + // Boost's Value type is the same as reference type. + //typedef reference value_type; + typedef typename tuple_of_value_types::type value_type; + + // Difference type is the first iterator's difference type + typedef typename thrust::iterator_traits< + typename thrust::tuple_element<0, IteratorTuple>::type + >::difference_type difference_type; + + // Iterator system is the minimum system tag in the + // iterator tuple + typedef typename + minimum_system_in_iterator_tuple::type system; + + // Traversal category is the minimum traversal category in the + // iterator tuple + typedef typename + minimum_traversal_category_in_iterator_tuple::type traversal_category; + + public: + + // The iterator facade type from which the zip iterator will + // be derived. 
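[editor's note, before the facade typedef below] minimum_system_in_iterator_tuple above is what decides which backend a zip_iterator belongs to; zipping a device iterator with a counting_iterator (tagged any_system_tag) therefore collapses to the device system. A hedged compile-time illustration; the C++11 static_assert only states the expectation, and the exact tag identity may differ slightly across Thrust versions:

#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/iterator_traits.h>
#include <thrust/device_vector.h>
#include <thrust/tuple.h>
#include <type_traits>

// any_system_tag converts to device_system_tag, so the "minimum" of the two
// systems is the device system and the zip dispatches on the device backend.
typedef thrust::zip_iterator<
    thrust::tuple<thrust::device_vector<int>::iterator,
                  thrust::counting_iterator<int> > > zip_t;

static_assert(std::is_same<thrust::iterator_system<zip_t>::type,
                           thrust::device_system_tag>::value,
              "zip of device iterator and counting_iterator is a device iterator");

int main(void) { return 0; }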
+ typedef thrust::iterator_facade< + zip_iterator, + value_type, + system, + traversal_category, + reference, + difference_type + > type; +}; // end zip_iterator_base + +} // end detail + +} // end thrust + + diff --git a/compat/thrust/iterator/discard_iterator.h b/compat/thrust/iterator/discard_iterator.h new file mode 100644 index 0000000..6e089b5 --- /dev/null +++ b/compat/thrust/iterator/discard_iterator.h @@ -0,0 +1,171 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file thrust/iterator/discard_iterator.h + * \brief An iterator which "discards" (ignores) values assigned to it upon dereference + */ + +#pragma once + +#include +#include +#include + +__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN + +namespace thrust +{ + +/*! \addtogroup iterators + * \{ + */ + +/*! \addtogroup fancyiterator Fancy Iterators + * \ingroup iterators + * \{ + */ + +/*! \p discard_iterator is an iterator which represents a special kind of pointer that + * ignores values written to it upon dereference. This iterator is useful for ignoring + * the output of certain algorithms without wasting memory capacity or bandwidth. + * \p discard_iterator may also be used to count the size of an algorithm's output which + * may not be known a priori. + * + * The following code snippet demonstrates how to use \p discard_iterator to ignore + * ignore one of the output ranges of reduce_by_key + * + * \code + * #include + * #include + * #include + * + * int main(void) + * { + * thrust::device_vector keys(7), values(7); + * + * keys[0] = 1; + * keys[1] = 3; + * keys[2] = 3; + * keys[3] = 3; + * keys[4] = 2; + * keys[5] = 2; + * keys[6] = 1; + * + * values[0] = 9; + * values[1] = 8; + * values[2] = 7; + * values[3] = 6; + * values[4] = 5; + * values[5] = 4; + * values[6] = 3; + * + * thrust::device_vector result(4); + * + * // we are only interested in the reduced values + * // use discard_iterator to ignore the output keys + * thrust::reduce_by_key(keys.begin(), keys.end(), + * values.begin(), values.end(), + * thrust::make_discard_iterator(), + * result.begin()); + * + * // result is now [9, 21, 9, 3] + * + * return 0; + * } + * \endcode + * + * \see make_discard_iterator + */ +template + class discard_iterator + : public detail::discard_iterator_base::type +{ + /*! \cond + */ + friend class thrust::iterator_core_access; + typedef typename detail::discard_iterator_base::type super_t; + typedef typename detail::discard_iterator_base::incrementable incrementable; + typedef typename detail::discard_iterator_base::base_iterator base_iterator; + + public: + typedef typename super_t::reference reference; + typedef typename super_t::value_type value_type; + + /*! \endcond + */ + + /*! Copy constructor copies from a source discard_iterator. + * + * \p rhs The discard_iterator to copy. + */ + __host__ __device__ + discard_iterator(discard_iterator const &rhs) + : super_t(rhs.base()) {} + + /*! 
This constructor receives an optional index specifying the position of this + * \p discard_iterator in a range. + * + * \p i The index of this \p discard_iterator in a range. Defaults to the + * value returned by \c Incrementable's null constructor. For example, + * when Incrementable == int, \c 0. + */ + __host__ __device__ + discard_iterator(incrementable const &i = incrementable()) + : super_t(base_iterator(i)) {} + + /*! \cond + */ + + private: // Core iterator interface + __host__ __device__ + reference dereference(void) const + { + return m_element; + } + + mutable value_type m_element; + + /*! \endcond + */ +}; // end constant_iterator + + +/*! \p make_discard_iterator creates a \p discard_iterator from an optional index parameter. + * + * \param i The index of the returned \p discard_iterator within a range. + * In the default case, the value of this parameter is \c 0. + * + * \return A new \p discard_iterator with index as given by \p i. + * + * \see constant_iterator + */ +inline __host__ __device__ +discard_iterator<> make_discard_iterator(discard_iterator<>::difference_type i = discard_iterator<>::difference_type(0)) +{ + return discard_iterator<>(i); +} // end make_discard_iterator() + +/*! \} // end fancyiterators + */ + +/*! \} // end iterators + */ + +} // end namespace thrust + +__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END + diff --git a/compat/thrust/iterator/iterator_adaptor.h b/compat/thrust/iterator/iterator_adaptor.h new file mode 100644 index 0000000..7b9cca3 --- /dev/null +++ b/compat/thrust/iterator/iterator_adaptor.h @@ -0,0 +1,239 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file thrust/iterator/iterator_adaptor.h + * \brief An iterator which adapts a base iterator + */ + +/* + * (C) Copyright David Abrahams 2002. + * (C) Copyright Jeremy Siek 2002. + * (C) Copyright Thomas Witt 2002. + * + * Distributed under the Boost Software License, Version 1.0. + * (See accompanying NOTICE file for the complete license) + * + * For more information, see http://www.boost.org + */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ + +/*! \addtogroup iterators + * \{ + */ + +/*! \addtogroup fancyiterator Fancy Iterators + * \ingroup iterators + * \{ + */ + +/*! \p iterator_adaptor is an iterator which adapts an existing type of iterator to create a new type of + * iterator. Most of Thrust's fancy iterators are defined via inheritance from \p iterator_adaptor. + * While composition of these existing Thrust iterators is often sufficient for expressing the desired + * functionality, it is occasionally more straightforward to derive from \p iterator_adaptor directly. 
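[editor's note] Returning briefly to discard_iterator above: because the iterator is random access over its index, the iterator returned by an algorithm can be differenced against a fresh discard_iterator to measure output size without allocating storage. A hedged sketch using unique_copy; any algorithm that only writes its output should behave the same way:

#include <thrust/iterator/discard_iterator.h>
#include <thrust/device_vector.h>
#include <thrust/unique.h>

int main(void)
{
  thrust::device_vector<int> keys(6);
  keys[0] = 1; keys[1] = 1; keys[2] = 2; keys[3] = 2; keys[4] = 2; keys[5] = 3;

  // write the unique values "nowhere", but keep the returned iterator:
  // its distance from a fresh discard_iterator is the output size
  thrust::discard_iterator<> result_end =
      thrust::unique_copy(keys.begin(), keys.end(),
                          thrust::make_discard_iterator());

  int num_unique = result_end - thrust::make_discard_iterator();  // 3

  return num_unique == 3 ? 0 : 1;
}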
+ * + * To see how to use \p iterator_adaptor to create a novel iterator type, let's examine how to use it to + * define \p repeat_iterator, a fancy iterator which repeats elements from another range a given number of time: + * + * \code + * #include + * + * // derive repeat_iterator from iterator_adaptor + * template + * class repeat_iterator + * : public thrust::iterator_adaptor< + * repeat_iterator, // the first template parameter is the name of the iterator we're creating + * Iterator // the second template parameter is the name of the iterator we're adapting + * // we can use the default for the additional template parameters + * > + * { + * public: + * // shorthand for the name of the iterator_adaptor we're deriving from + * typedef thrust::iterator_adaptor< + * repeat_iterator, + * Iterator + * > super_t; + * + * __host__ __device__ + * repeat_iterator(const Iterator &x, int n) : super_t(x), begin(x), n(n) {} + * + * // befriend thrust::iterator_core_access to allow it access to the private interface below + * friend class thrust::iterator_core_access; + * + * private: + * // repeat each element of the adapted range n times + * unsigned int n; + * + * // used to keep track of where we began + * const Iterator begin; + * + * // it is private because only thrust::iterator_core_access needs access to it + * __host__ __device__ + * typename super_t::reference dereference() const + * { + * return *(begin + (this->base() - begin) / n); + * } + * }; + * \endcode + * + * Except for the first two, \p iterator_adaptor's template parameters are optional. When omitted, or when the + * user specifies \p thrust::use_default in its place, \p iterator_adaptor will use a default type inferred from \p Base. + * + * \p iterator_adaptor's functionality is derived from and generally equivalent to \p boost::iterator_adaptor. + * The exception is Thrust's addition of the template parameter \p System, which is necessary to allow Thrust + * to dispatch an algorithm to one of several parallel backend systems. + * + * \p iterator_adaptor is a powerful tool for creating custom iterators directly. However, the large set of iterator semantics which must be satisfied + * for algorithm compatibility can make \p iterator_adaptor difficult to use correctly. Unless you require the full expressivity of \p iterator_adaptor, + * consider building a custom iterator through composition of existing higher-level fancy iterators instead. + * + * Interested users may refer to boost::iterator_adaptor's documentation for further usage examples. + */ +template + class iterator_adaptor: + public detail::iterator_adaptor_base< + Derived, Base, Value, System, Traversal, Reference, Difference + >::type +{ + /*! \cond + */ + + friend class thrust::iterator_core_access; + + protected: + typedef typename detail::iterator_adaptor_base< + Derived, Base, Value, System, Traversal, Reference, Difference + >::type super_t; + + /*! \endcond + */ + + public: + /*! \p iterator_adaptor's default constructor does nothing. + */ + __host__ __device__ + iterator_adaptor(){} + + /*! This constructor copies from a given instance of the \p Base iterator. + */ + __host__ __device__ + explicit iterator_adaptor(Base const& iter) + : m_iterator(iter) + {} + + /*! The type of iterator this \p iterator_adaptor's \p adapts. + */ + typedef Base base_type; + + /*! \cond + */ + typedef typename super_t::reference reference; + + typedef typename super_t::difference_type difference_type; + /*! \endcond + */ + + /*! 
\return A \p const reference to the \p Base iterator this \p iterator_adaptor adapts. + */ + __host__ __device__ + Base const& base() const + { return m_iterator; } + + protected: + /*! \return A \p const reference to the \p Base iterator this \p iterator_adaptor adapts. + */ + __host__ __device__ + Base const& base_reference() const + { return m_iterator; } + + /*! \return A mutable reference to the \p Base iterator this \p iterator_adaptor adapts. + */ + __host__ __device__ + Base& base_reference() + { return m_iterator; } + + /*! \cond + */ + private: // Core iterator interface for iterator_facade + + __thrust_hd_warning_disable__ + __host__ __device__ + typename iterator_adaptor::reference dereference() const + { return *m_iterator; } + + __thrust_hd_warning_disable__ + template + __host__ __device__ + bool equal(iterator_adaptor const& x) const + { return m_iterator == x.base(); } + + __thrust_hd_warning_disable__ + __host__ __device__ + void advance(typename iterator_adaptor::difference_type n) + { + // XXX statically assert on random_access_traversal_tag + m_iterator += n; + } + + __thrust_hd_warning_disable__ + __host__ __device__ + void increment() + { ++m_iterator; } + + __thrust_hd_warning_disable__ + __host__ __device__ + void decrement() + { + // XXX statically assert on bidirectional_traversal_tag + --m_iterator; + } + + __thrust_hd_warning_disable__ + template + __host__ __device__ + typename iterator_adaptor::difference_type distance_to(iterator_adaptor const& y) const + { return y.base() - m_iterator; } + + private: + Base m_iterator; + + /*! \endcond + */ +}; // end iterator_adaptor + +/*! \} // end fancyiterators + */ + +/*! \} // end iterators + */ + +} // end thrust + diff --git a/compat/thrust/iterator/iterator_categories.h b/compat/thrust/iterator/iterator_categories.h new file mode 100644 index 0000000..81601b4 --- /dev/null +++ b/compat/thrust/iterator/iterator_categories.h @@ -0,0 +1,191 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file thrust/iterator/iterator_categories.h + * \brief Types for reasoning about the categories of iterators + */ + +/* + * (C) Copyright Jeremy Siek 2002. + * + * Distributed under the Boost Software License, Version 1.0. + * (See accompanying NOTICE file for the complete license) + * + * For more information, see http://www.boost.org + */ + + +#pragma once + +#include + +// #include this for stl's iterator tags +#include + +namespace thrust +{ + +/*! \addtogroup iterators + * \addtogroup iterator_tags Iterator Tags + * \ingroup iterators + * \addtogroup iterator_tag_classes Iterator Tag Classes + * \ingroup iterator_tags + * \{ + */ + +/*! \p input_device_iterator_tag is an empty class: it has no member functions, + * member variables, or nested types. It is used solely as a "tag": a + * representation of the Input Device Iterator concept within the C++ type + * system. 
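[editor's note] As a usage note for the repeat_iterator developed in iterator_adaptor's documentation above (assuming its elided template parameter list is template <typename Iterator>): adapting a counting_iterator keeps the pointer arithmetic safe while showing the repetition, since the base iterator can be advanced arbitrarily.

#include <thrust/iterator/counting_iterator.h>
#include <thrust/device_vector.h>
#include <thrust/copy.h>
// plus the repeat_iterator definition from the documentation above

int main(void)
{
  typedef thrust::counting_iterator<int> counter;

  // repeat each counted value 3 times: 0,0,0,1,1,1,...
  repeat_iterator<counter> first(thrust::make_counting_iterator(0), 3);

  thrust::device_vector<int> out(6);
  thrust::copy(first, first + 6, out.begin());

  // out is now {0, 0, 0, 1, 1, 1}
  return 0;
}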
+ * + * \see http://www.sgi.com/tech/sgi/input_iterator_tag.html, iterator_traits, + * output_device_iterator_tag, forward_device_iterator_tag, + * bidirectional_device_iterator_tag, random_access_device_iterator_tag, + * input_host_iterator_tag, output_host_iterator_tag, forward_host_iterator_tag, + * bidirectional_host_iterator_tag, random_access_host_iterator_tag + */ +struct input_device_iterator_tag {}; + +/*! \p output_device_iterator_tag is an empty class: it has no member functions, + * member variables, or nested types. It is used solely as a "tag": a + * representation of the Output Device Iterator concept within the C++ type + * system. + * + * \see http://www.sgi.com/tech/sgi/output_iterator_tag.html, iterator_traits, + * input_device_iterator_tag, forward_device_iterator_tag, + * bidirectional_device_iterator_tag, random_access_device_iterator_tag, + * input_host_iterator_tag, output_host_iterator_tag, forward_host_iterator_tag, + * bidirectional_host_iterator_tag, random_access_host_iterator_tag + */ +struct output_device_iterator_tag {}; + +/*! \p forward_device_iterator_tag is an empty class: it has no member functions, + * member variables, or nested types. It is used solely as a "tag": a + * representation of the Forward Device Iterator concept within the C++ type + * system. + * + * \see http://www.sgi.com/tech/sgi/forward_iterator_tag.html, iterator_traits, + * input_device_iterator_tag, output_device_iterator_tag, + * bidirectional_device_iterator_tag, random_access_device_iterator_tag, + * input_host_iterator_tag, output_host_iterator_tag, forward_host_iterator_tag, + * bidirectional_host_iterator_tag, random_access_host_iterator_tag + */ +struct forward_device_iterator_tag : public input_device_iterator_tag {}; + +/*! \p bidirectional_device_iterator_tag is an empty class: it has no member + * functions, member variables, or nested types. It is used solely as a "tag": a + * representation of the Bidirectional Device Iterator concept within the C++ + * type system. + * + * \see http://www.sgi.com/tech/sgi/bidirectional_iterator_tag.html, + * iterator_traits, input_device_iterator_tag, output_device_iterator_tag, + * forward_device_iterator_tag, random_access_device_iterator_tag, + * input_host_iterator_tag, output_host_iterator_tag, forward_host_iterator_tag, + * bidirectional_host_iterator_tag, random_access_host_iterator_tag + */ +struct bidirectional_device_iterator_tag : public forward_device_iterator_tag {}; + +/*! \p random_access_device_iterator_tag is an empty class: it has no member + * functions, member variables, or nested types. It is used solely as a "tag": a + * representation of the Random Access Device Iterator concept within the C++ + * type system. + * + * \see http://www.sgi.com/tech/sgi/random_access_iterator_tag.html, + * iterator_traits, input_device_iterator_tag, output_device_iterator_tag, + * forward_device_iterator_tag, bidirectional_device_iterator_tag, + * input_host_iterator_tag, output_host_iterator_tag, forward_host_iterator_tag, + * bidirectional_host_iterator_tag, random_access_host_iterator_tag + */ +struct random_access_device_iterator_tag : public bidirectional_device_iterator_tag {}; + +/*! \p input_host_iterator_tag is an empty class: it has no member + * functions, member variables, or nested types. It is used solely as a "tag": a + * representation of the Input Host Iterator concept within the C++ + * type system. 
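[editor's note] The device tag hierarchy above mirrors the standard one: each refined tag publicly derives from the weaker tag, and that derived-to-base conversion is what algorithm dispatch relies on. A minimal compile-time check of the relationship; C++11 traits are used only for the demonstration:

#include <thrust/iterator/iterator_categories.h>
#include <type_traits>

// random_access -> bidirectional -> forward -> input, so the refined tags
// convert to the weaker ones
static_assert(std::is_convertible<thrust::random_access_device_iterator_tag,
                                  thrust::input_device_iterator_tag>::value,
              "random access device tag refines the input device tag");
static_assert(std::is_convertible<thrust::bidirectional_device_iterator_tag,
                                  thrust::forward_device_iterator_tag>::value,
              "bidirectional device tag refines the forward device tag");

int main(void) { return 0; }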
+ * + * \see http://www.sgi.com/tech/sgi/input_iterator_tag.html, + * iterator_traits, input_device_iterator_tag, output_device_iterator_tag, + * forward_device_iterator_tag, bidirectional_device_iterator_tag, + * random_access_device_iterator_tag, + * output_host_iterator_tag, forward_host_iterator_tag, + * bidirectional_host_iterator_tag, random_access_host_iterator_tag + */ +typedef std::input_iterator_tag input_host_iterator_tag; + +/*! \p output_host_iterator_tag is an empty class: it has no member + * functions, member variables, or nested types. It is used solely as a "tag": a + * representation of the Output Host Iterator concept within the C++ + * type system. + * + * \see http://www.sgi.com/tech/sgi/output_iterator_tag.html, + * iterator_traits, input_device_iterator_tag, output_device_iterator_tag, + * forward_device_iterator_tag, bidirectional_device_iterator_tag, + * random_access_device_iterator_tag, + * input_host_iterator_tag, forward_host_iterator_tag, + * bidirectional_host_iterator_tag, random_access_host_iterator_tag + */ +typedef std::output_iterator_tag output_host_iterator_tag; + +/*! \p forward_host_iterator_tag is an empty class: it has no member + * functions, member variables, or nested types. It is used solely as a "tag": a + * representation of the Forward Host Iterator concept within the C++ + * type system. + * + * \see http://www.sgi.com/tech/sgi/forward_iterator_tag.html, + * iterator_traits, input_device_iterator_tag, output_device_iterator_tag, + * forward_device_iterator_tag, bidirectional_device_iterator_tag, + * random_access_device_iterator_tag, + * input_host_iterator_tag, output_host_iterator_tag, + * bidirectional_host_iterator_tag, random_access_host_iterator_tag + */ +typedef std::forward_iterator_tag forward_host_iterator_tag; + +/*! \p bidirectional_host_iterator_tag is an empty class: it has no member + * functions, member variables, or nested types. It is used solely as a "tag": a + * representation of the Forward Host Iterator concept within the C++ + * type system. + * + * \see http://www.sgi.com/tech/sgi/bidirectional_iterator_tag.html, + * iterator_traits, input_device_iterator_tag, output_device_iterator_tag, + * forward_device_iterator_tag, bidirectional_device_iterator_tag, + * random_access_device_iterator_tag, + * input_host_iterator_tag, output_host_iterator_tag, + * forward_host_iterator_tag, random_access_host_iterator_tag + */ +typedef std::bidirectional_iterator_tag bidirectional_host_iterator_tag; + +/*! \p random_access_host_iterator_tag is an empty class: it has no member + * functions, member variables, or nested types. It is used solely as a "tag": a + * representation of the Forward Host Iterator concept within the C++ + * type system. + * + * \see http://www.sgi.com/tech/sgi/random_access_iterator_tag.html, + * iterator_traits, input_device_iterator_tag, output_device_iterator_tag, + * forward_device_iterator_tag, bidirectional_device_iterator_tag, + * random_access_device_iterator_tag, + * input_host_iterator_tag, output_host_iterator_tag, + * forward_host_iterator_tag, bidirectional_host_iterator_tag + */ +typedef std::random_access_iterator_tag random_access_host_iterator_tag; + +/*! 
\} // end iterator_tag_classes + */ + +} // end namespace thrust + +#include + diff --git a/compat/thrust/iterator/iterator_facade.h b/compat/thrust/iterator/iterator_facade.h new file mode 100644 index 0000000..232c150 --- /dev/null +++ b/compat/thrust/iterator/iterator_facade.h @@ -0,0 +1,538 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file thrust/iterator/iterator_facade.h + * \brief A class which exposes a public interface for iterators + */ + +/* + * (C) Copyright David Abrahams 2002. + * (C) Copyright Jeremy Siek 2002. + * (C) Copyright Thomas Witt 2002. + * + * Distributed under the Boost Software License, Version 1.0. + * (See accompanying NOTICE file for the complete license) + * + * For more information, see http://www.boost.org + */ + + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ + +/*! \addtogroup iterators + * \{ + */ + +/*! \addtogroup fancyiterator Fancy Iterators + * \ingroup iterators + * \{ + */ + + +// This forward declaration is required for the friend declaration +// in iterator_core_access +template class iterator_facade; + + +/*! \p iterator_core_access is the class which user iterator types derived from \p thrust::iterator_adaptor + * or \p thrust::iterator_facade must befriend to allow it to access their private interface. + */ +class iterator_core_access +{ + /*! 
\cond + */ + + // declare our friends + template friend class iterator_facade; + + // iterator comparisons are our friends + template + inline __host__ __device__ + friend bool + operator ==(iterator_facade const& lhs, + iterator_facade const& rhs); + + template + inline __host__ __device__ + friend bool + operator !=(iterator_facade const& lhs, + iterator_facade const& rhs); + + template + inline __host__ __device__ + friend bool + operator <(iterator_facade const& lhs, + iterator_facade const& rhs); + + template + inline __host__ __device__ + friend bool + operator >(iterator_facade const& lhs, + iterator_facade const& rhs); + + template + inline __host__ __device__ + friend bool + operator <=(iterator_facade const& lhs, + iterator_facade const& rhs); + + template + inline __host__ __device__ + friend bool + operator >=(iterator_facade const& lhs, + iterator_facade const& rhs); + + // iterator difference is our friend + template + inline __host__ __device__ + friend + typename thrust::detail::distance_from_result< + iterator_facade, + iterator_facade + >::type + operator-(iterator_facade const& lhs, + iterator_facade const& rhs); + + template + __host__ __device__ + static typename Facade::reference dereference(Facade const& f) + { + return f.dereference(); + } + + template + __host__ __device__ + static void increment(Facade& f) + { + f.increment(); + } + + template + __host__ __device__ + static void decrement(Facade& f) + { + f.decrement(); + } + + template + __host__ __device__ + static bool equal(Facade1 const& f1, Facade2 const& f2) + { + return f1.equal(f2); + } + + // XXX TODO: Investigate whether we need both of these cases + //template + //__host__ __device__ + //static bool equal(Facade1 const& f1, Facade2 const& f2, mpl::true_) + //{ + // return f1.equal(f2); + //} + + //template + //__host__ __device__ + //static bool equal(Facade1 const& f1, Facade2 const& f2, mpl::false_) + //{ + // return f2.equal(f1); + //} + + template + __host__ __device__ + static void advance(Facade& f, typename Facade::difference_type n) + { + f.advance(n); + } + + // Facade2 is convertible to Facade1, + // so return Facade1's difference_type + template + __host__ __device__ + static typename Facade1::difference_type + distance_from(Facade1 const& f1, Facade2 const& f2, thrust::detail::true_type) + { + return -f1.distance_to(f2); + } + + // Facade2 is not convertible to Facade1, + // so return Facade2's difference_type + template + __host__ __device__ + static typename Facade2::difference_type + distance_from(Facade1 const& f1, Facade2 const& f2, thrust::detail::false_type) + { + return f2.distance_to(f1); + } + + template + __host__ __device__ + static typename thrust::detail::distance_from_result::type + distance_from(Facade1 const& f1, Facade2 const& f2) + { + // dispatch the implementation of this method upon whether or not + // Facade2 is convertible to Facade1 + return distance_from(f1, f2, + typename thrust::detail::is_convertible::type()); + } + + // + // Curiously Recurring Template interface. + // + template + __host__ __device__ + static Derived& derived(iterator_facade& facade) + { + return *static_cast(&facade); + } + + template + __host__ __device__ + static Derived const& derived(iterator_facade const& facade) + { + return *static_cast(&facade); + } + + /*! \endcond + */ +}; // end iterator_core_access + + +/*! 
\p iterator_facade is a template which allows the programmer to define a novel iterator with a standards-conforming interface + * which Thrust can use to reason about algorithm acceleration opportunities. + * + * Because most of a standard iterator's interface is defined in terms of a small set of core primitives, \p iterator_facade + * defines the non-primitive portion mechanically. In principle a novel iterator could explicitly provide the entire interface in + * an ad hoc fashion but doing so might be tedious and prone to subtle errors. + * + * Often \p iterator_facade is too primitive a tool to use for defining novel iterators. In these cases, \p iterator_adaptor + * or a specific fancy iterator should be used instead. + * + * \p iterator_facade's functionality is derived from and generally equivalent to \p boost::iterator_facade. + * The exception is Thrust's addition of the template parameter \p System, which is necessary to allow Thrust + * to dispatch an algorithm to one of several parallel backend systems. An additional exception is Thrust's omission + * of the \c operator-> member function. + * + * Interested users may refer to boost::iterator_facade's documentation for usage examples. + * + * \note \p iterator_facade's arithmetic operator free functions exist with the usual meanings but are omitted here for brevity. + */ +template + class iterator_facade +{ + private: + /*! \cond + */ + + // + // Curiously Recurring Template interface. + // + __host__ __device__ + Derived& derived() + { + return *static_cast(this); + } + + __host__ __device__ + Derived const& derived() const + { + return *static_cast(this); + } + /*! \endcond + */ + + public: + /*! The type of element pointed to by \p iterator_facade. + */ + typedef typename thrust::detail::remove_const::type value_type; + + /*! The return type of \p iterator_facade::operator*(). + */ + typedef Reference reference; + + /*! The return type of \p iterator_facade's non-existent \c operator->() + * member function. Unlike \c boost::iterator_facade, \p iterator_facade + * disallows access to the \p value_type's members through expressions of the + * form iter->member. \p pointer is defined to \c void to indicate + * that these expressions are not allowed. This limitation may be relaxed in a + * future version of Thrust. + */ + typedef void pointer; + + /*! The type of expressions of the form x - y where x and y + * are of type \p iterator_facade. + */ + typedef Difference difference_type; + + /*! The type of iterator category of \p iterator_facade. + */ + typedef typename thrust::detail::iterator_facade_category< + System, Traversal, Value, Reference + >::type iterator_category; + + /*! \p operator*() dereferences this \p iterator_facade. + * \return A reference to the element pointed to by this \p iterator_facade. + */ + __host__ __device__ + reference operator*() const + { + return iterator_core_access::dereference(this->derived()); + } + + // XXX unimplemented for now, consider implementing it later + //pointer operator->() const + //{ + // return; + //} + + // XXX investigate whether or not we need to go to the lengths + // boost does to determine the return type + + /*! \p operator[] performs indexed dereference. + * \return A reference to the element \p n distance away from this \p iterator_facade. + */ + __host__ __device__ + reference operator[](difference_type n) const + { + return *(this->derived() + n); + } + + /*! \p operator++ pre-increments this \p iterator_facade to refer to the element in the next position. 
+ * \return *this + */ + __host__ __device__ + Derived& operator++() + { + iterator_core_access::increment(this->derived()); + return this->derived(); + } + + /*! \p operator++ post-increments this \p iterator_facade and returns a new \p iterator_facade referring to the element in the next position. + * \return A copy of *this before increment. + */ + __host__ __device__ + Derived operator++(int) + { + Derived tmp(this->derived()); + ++*this; + return tmp; + } + + /*! \p operator-- pre-decrements this \p iterator_facade to refer to the element in the previous position. + * \return *this + */ + __host__ __device__ + Derived& operator--() + { + iterator_core_access::decrement(this->derived()); + return this->derived(); + } + + /*! \p operator-- post-decrements this \p iterator_facade and returns a new \p iterator_facade referring to the element in the previous position. + * \return A copy of *this before decrement. + */ + __host__ __device__ + Derived operator--(int) + { + Derived tmp(this->derived()); + --*this; + return tmp; + } + + /*! \p operator+= increments this \p iterator_facade to refer to an element a given distance after its current position. + * \param n The quantity to increment. + * \return *this + */ + __host__ __device__ + Derived& operator+=(difference_type n) + { + iterator_core_access::advance(this->derived(), n); + return this->derived(); + } + + /*! \p operator-= decrements this \p iterator_facade to refer to an element a given distance before its current postition. + * \param n The quantity to decrement. + * \return *this + */ + __host__ __device__ + Derived& operator-=(difference_type n) + { + iterator_core_access::advance(this->derived(), -n); + return this->derived(); + } + + /*! \p operator- subtracts a given quantity from this \p iterator_facade and returns a new \p iterator_facade referring to the element at the given position before this \p iterator_facade. + * \param n The quantity to decrement + * \return An \p iterator_facade pointing \p n elements before this \p iterator_facade. + */ + __host__ __device__ + Derived operator-(difference_type n) const + { + Derived result(this->derived()); + return result -= n; + } +}; // end iterator_facade + +/*! 
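+ *  A minimal usage sketch (illustrative only; \c repeat_iterator is a hypothetical
+ *  name, and the template parameter order (Derived, Value, System, Traversal,
+ *  Reference) is assumed): an iterator which yields the same value at every
+ *  position, defined by supplying only the core primitives invoked through
+ *  \p iterator_core_access.
+ *
+ *  \code
+ *  #include <thrust/iterator/iterator_facade.h>
+ *  #include <cstddef>
+ *
+ *  class repeat_iterator
+ *    : public thrust::iterator_facade<
+ *        repeat_iterator,                      // Derived
+ *        int,                                  // Value
+ *        thrust::any_system_tag,               // System
+ *        thrust::random_access_traversal_tag,  // Traversal
+ *        int                                   // Reference (returned by value)
+ *      >
+ *  {
+ *    public:
+ *      __host__ __device__
+ *      repeat_iterator(int value, std::ptrdiff_t pos = 0)
+ *        : m_value(value), m_pos(pos) {}
+ *
+ *    private:
+ *      friend class thrust::iterator_core_access;
+ *
+ *      // the core primitives called through iterator_core_access
+ *      __host__ __device__ int  dereference() const { return m_value; }
+ *      __host__ __device__ bool equal(repeat_iterator const &other) const { return m_pos == other.m_pos; }
+ *      __host__ __device__ void increment() { ++m_pos; }
+ *      __host__ __device__ void decrement() { --m_pos; }
+ *      __host__ __device__ void advance(std::ptrdiff_t n) { m_pos += n; }
+ *      __host__ __device__ std::ptrdiff_t distance_to(repeat_iterator const &other) const { return other.m_pos - m_pos; }
+ *
+ *      int            m_value;
+ *      std::ptrdiff_t m_pos;
+ *  };
+ *  \endcode
+ */
+
+/*! 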
\cond + */ + +// Comparison operators +template +inline __host__ __device__ +// XXX it might be nice to implement this at some point +//typename enable_if_interoperable::type // exposition +bool +operator ==(iterator_facade const& lhs, + iterator_facade const& rhs) +{ + return iterator_core_access + ::equal(*static_cast(&lhs), + *static_cast(&rhs)); +} + +template +inline __host__ __device__ +// XXX it might be nice to implement this at some point +//typename enable_if_interoperable::type // exposition +bool +operator !=(iterator_facade const& lhs, + iterator_facade const& rhs) +{ + return !iterator_core_access + ::equal(*static_cast(&lhs), + *static_cast(&rhs)); +} + +template +inline __host__ __device__ +// XXX it might be nice to implement this at some point +//typename enable_if_interoperable::type // exposition +bool +operator <(iterator_facade const& lhs, + iterator_facade const& rhs) +{ + return 0 > iterator_core_access + ::distance_from(*static_cast(&lhs), + *static_cast(&rhs)); +} + +template +inline __host__ __device__ +// XXX it might be nice to implement this at some point +//typename enable_if_interoperable::type // exposition +bool +operator >(iterator_facade const& lhs, + iterator_facade const& rhs) +{ + return 0 < iterator_core_access + ::distance_from(*static_cast(&lhs), + *static_cast(&rhs)); +} + +template +inline __host__ __device__ +// XXX it might be nice to implement this at some point +//typename enable_if_interoperable::type // exposition +bool +operator <=(iterator_facade const& lhs, + iterator_facade const& rhs) +{ + return 0 >= iterator_core_access + ::distance_from(*static_cast(&lhs), + *static_cast(&rhs)); +} + +template +inline __host__ __device__ +// XXX it might be nice to implement this at some point +//typename enable_if_interoperable::type // exposition +bool +operator >=(iterator_facade const& lhs, + iterator_facade const& rhs) +{ + return 0 <= iterator_core_access + ::distance_from(*static_cast(&lhs), + *static_cast(&rhs)); +} + +// Iterator difference +template +inline __host__ __device__ + +// divine the type this operator returns +typename thrust::detail::distance_from_result< + iterator_facade, + iterator_facade +>::type + +operator-(iterator_facade const& lhs, + iterator_facade const& rhs) +{ + return iterator_core_access + ::distance_from(*static_cast(&lhs), + *static_cast(&rhs)); +} + +// Iterator addition +template +inline __host__ __device__ +Derived operator+ (iterator_facade const& i, + typename Derived::difference_type n) +{ + Derived tmp(static_cast(i)); + return tmp += n; +} + +template +inline __host__ __device__ +Derived operator+ (typename Derived::difference_type n, + iterator_facade const& i) +{ + Derived tmp(static_cast(i)); + return tmp += n; +} + +/*! \endcond + */ + +/*! \} // end fancyiterators + */ + +/*! \} // end iterators + */ + +} // end thrust + diff --git a/compat/thrust/iterator/iterator_traits.h b/compat/thrust/iterator/iterator_traits.h new file mode 100644 index 0000000..a16f219 --- /dev/null +++ b/compat/thrust/iterator/iterator_traits.h @@ -0,0 +1,76 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file thrust/iterator/iterator_traits.h + * \brief Traits and metafunctions for reasoning about the traits of iterators + */ + +/* + * (C) Copyright David Abrahams 2003. + * + * Distributed under the Boost Software License, Version 1.0. + * (See accompanying NOTICE file for the complete license) + * + * For more information, see http://www.boost.org + */ + +#pragma once + +#include +#include + +namespace thrust +{ + +/*! \p iterator_traits is a type trait class that provides a uniform + * interface for querying the properties of iterators at compile-time. + */ +template + struct iterator_traits + : public std::iterator_traits +{ +}; // end iterator_traits + + +template struct iterator_value; + +template struct iterator_pointer; + +template struct iterator_reference; + +template struct iterator_difference; + +template struct iterator_traversal; + +template struct iterator_system; + +// TODO remove this in Thrust v1.7.0 +template + struct THRUST_DEPRECATED iterator_space +{ + typedef THRUST_DEPRECATED typename iterator_system::type type; +}; + + +} // end thrust + +#include +#include +#include +#include +#include + diff --git a/compat/thrust/iterator/permutation_iterator.h b/compat/thrust/iterator/permutation_iterator.h new file mode 100644 index 0000000..509097b --- /dev/null +++ b/compat/thrust/iterator/permutation_iterator.h @@ -0,0 +1,210 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file thrust/iterator/permutation_iterator.h + * \brief An iterator which performs a gather or scatter operation when dereferenced + */ + +/* + * (C) Copyright Toon Knapen 2001. + * (C) Copyright David Abrahams 2003. + * (C) Copyright Roland Richter 2003. + * + * Distributed under the Boost Software License, Version 1.0. + * (See accompanying NOTICE file for the complete license) + * + * For more information, see http://www.boost.org + */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace thrust +{ + + +/*! \addtogroup iterators + * \{ + */ + +/*! \addtogroup fancyiterator Fancy Iterators + * \ingroup iterators + * \{ + */ + +/*! \p permutation_iterator is an iterator which represents a pointer into a + * reordered view of a given range. \p permutation_iterator is an imprecise name; + * the reordered view need not be a strict permutation. This iterator is useful + * for fusing a scatter or gather operation with other algorithms. 
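+ *
+ * For instance (an illustrative sketch which assumes the \c values and \c indices
+ * vectors defined in the code snippet further below, plus the thrust/reduce.h header),
+ * a gather can be fused directly into a reduction without materializing the
+ * gathered range in memory:
+ *
+ * \code
+ * float sum = thrust::reduce(
+ *   thrust::make_permutation_iterator(values.begin(), indices.begin()),
+ *   thrust::make_permutation_iterator(values.begin(), indices.end()));
+ * \endcode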
+ *
+ * This iterator takes two arguments:
+ *
+ *   - an iterator to the range \c V on which the "permutation" will be applied
+ *   - the reindexing scheme that defines how the elements of \c V will be permuted.
+ *
+ * Note that \p permutation_iterator is not limited to strict permutations of the
+ * given range \c V. The distance between begin and end of the reindexing iterators
+ * is allowed to be smaller compared to the size of the range \c V, in which case
+ * the \p permutation_iterator only provides a "permutation" of a subrange of \c V.
+ * Nor do the indices need to be unique. In this same context, it must be noted
+ * that the past-the-end \p permutation_iterator is completely defined by means of
+ * the past-the-end iterator to the indices.
+ *
+ * The following code snippet demonstrates how to create a \p permutation_iterator
+ * which represents a reordering of the contents of a \p device_vector.
+ *
+ * \code
+ * #include <thrust/iterator/permutation_iterator.h>
+ * #include <thrust/device_vector.h>
+ * ...
+ * thrust::device_vector<float> values(8);
+ * values[0] = 10.0f;
+ * values[1] = 20.0f;
+ * values[2] = 30.0f;
+ * values[3] = 40.0f;
+ * values[4] = 50.0f;
+ * values[5] = 60.0f;
+ * values[6] = 70.0f;
+ * values[7] = 80.0f;
+ *
+ * thrust::device_vector<int> indices(4);
+ * indices[0] = 2;
+ * indices[1] = 6;
+ * indices[2] = 1;
+ * indices[3] = 3;
+ *
+ * typedef thrust::device_vector<float>::iterator ElementIterator;
+ * typedef thrust::device_vector<int>::iterator   IndexIterator;
+ *
+ * thrust::permutation_iterator<ElementIterator,IndexIterator> iter(values.begin(), indices.begin());
+ *
+ * *iter;   // returns 30.0f;
+ * iter[0]; // returns 30.0f;
+ * iter[1]; // returns 70.0f;
+ * iter[2]; // returns 20.0f;
+ * iter[3]; // returns 40.0f;
+ *
+ * // iter[4] is an out-of-bounds error
+ *
+ * *iter   = -1.0f; // sets values[2] to -1.0f;
+ * iter[0] = -1.0f; // sets values[2] to -1.0f;
+ * iter[1] = -1.0f; // sets values[6] to -1.0f;
+ * iter[2] = -1.0f; // sets values[1] to -1.0f;
+ * iter[3] = -1.0f; // sets values[3] to -1.0f;
+ *
+ * // values is now {10, -1, -1, -1, 50, 60, -1, 80}
+ * \endcode
+ *
+ * \see make_permutation_iterator
+ */
+template <typename ElementIterator,
+          typename IndexIterator>
+  class permutation_iterator
+    : public thrust::detail::permutation_iterator_base<
+        ElementIterator,
+        IndexIterator
+      >::type
+{
+  /*! \cond
+   */
+  private:
+    typedef typename detail::permutation_iterator_base<ElementIterator,IndexIterator>::type super_t;
+
+    friend class thrust::iterator_core_access;
+  /*! \endcond
+   */
+
+  public:
+    /*! Null constructor calls the null constructor of this \p permutation_iterator's
+     *  element iterator.
+     */
+    __host__ __device__
+    permutation_iterator()
+      : m_element_iterator() {}
+
+    /*! Constructor accepts an \c ElementIterator into a range of values and an
+     *  \c IndexIterator into a range of indices defining the indexing scheme on the
+     *  values.
+     *
+     *  \param x An \c ElementIterator pointing to this \p permutation_iterator's range of values.
+     *  \param y An \c IndexIterator pointing to an indexing scheme to use on \p x.
+     */
+    __host__ __device__
+    explicit permutation_iterator(ElementIterator x, IndexIterator y)
+      : super_t(y), m_element_iterator(x) {}
+
+    /*! Copy constructor accepts a related \p permutation_iterator.
+     *  \param r A compatible \p permutation_iterator to copy from.
+     */
+    template <typename OtherElementIterator, typename OtherIndexIterator>
+    __host__ __device__
+    permutation_iterator(permutation_iterator<OtherElementIterator,OtherIndexIterator> const &r
+    // XXX remove these guards when we have static_assert
+                         , typename detail::enable_if_convertible<OtherElementIterator, ElementIterator>::type* = 0
+                         , typename detail::enable_if_convertible<OtherIndexIterator, IndexIterator>::type* = 0
+                         )
+      : super_t(r.base()), m_element_iterator(r.m_element_iterator)
+    {}
+
+    /*! 
\cond + */ + private: + __thrust_hd_warning_disable__ + __host__ __device__ + typename super_t::reference dereference() const + { + return *(m_element_iterator + *this->base()); + } + + // make friends for the copy constructor + template friend class permutation_iterator; + + ElementIterator m_element_iterator; + /*! \endcond + */ +}; // end permutation_iterator + + +/*! \p make_permutation_iterator creates a \p permutation_iterator + * from an \c ElementIterator pointing to a range of elements to "permute" + * and an \c IndexIterator pointing to a range of indices defining an indexing + * scheme on the values. + * + * \param e An \c ElementIterator pointing to a range of values. + * \param i An \c IndexIterator pointing to an indexing scheme to use on \p e. + * \return A new \p permutation_iterator which permutes the range \p e by \p i. + * \see permutation_iterator + */ +template +__host__ __device__ +permutation_iterator make_permutation_iterator(ElementIterator e, IndexIterator i) +{ + return permutation_iterator(e,i); +} + +/*! \} // end fancyiterators + */ + +/*! \} // end iterators + */ + +} // end thrust + diff --git a/compat/thrust/iterator/retag.h b/compat/thrust/iterator/retag.h new file mode 100644 index 0000000..660da8f --- /dev/null +++ b/compat/thrust/iterator/retag.h @@ -0,0 +1,68 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file thrust/iterator/retag.h + * \brief Functionality for altering an iterator's associated system. + */ + +#pragma once + +#include +#include + +namespace thrust +{ + + +/*! \ingroup iterator_tags + * \{ + */ + +#if 0 +/*! \p reinterpret_tag returns a copy of an iterator and changes the type of the result's system tag. + * \tparam Tag Any system tag. + * \tparam Iterator Any iterator type. + * \param iter The iterator of interest. + * \return An iterator of unspecified type whose system tag is \p Tag and whose behavior is otherwise + * equivalent to \p iter. + * \note Unlike \p retag, \p reinterpret_tag does not enforce that the converted-to system tag be + * related to the converted-from system tag. + * \see retag + */ +template +unspecified_iterator_type reinterpret_tag(Iterator iter); + +/*! \p retag returns a copy of an iterator and changes the type of the result's system tag. + * \tparam Tag \p Tag shall be convertible to thrust::iterator_system::type, + * or thrust::iterator_system::type is a base type of \p Tag. + * \tparam Iterator Any iterator type. + * \param iter The iterator of interest. + * \return An iterator of unspecified type whose system tag is \p Tag and whose behavior is + * otherwise equivalent to \p iter. + * \note Unlike \p reinterpret_tag, \p retag enforces that the converted-to system tag be + * related to the converted-from system tag. + * \see reinterpret_tag + */ +template +unspecified_iterator_type retag(Iterator iter); +#endif + +/*! 
\} // iterator_tags + */ + + +} // end thrust + diff --git a/compat/thrust/iterator/reverse_iterator.h b/compat/thrust/iterator/reverse_iterator.h new file mode 100644 index 0000000..03f0339 --- /dev/null +++ b/compat/thrust/iterator/reverse_iterator.h @@ -0,0 +1,238 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file thrust/iterator/reverse_iterator.h + * \brief An iterator adaptor which adapts another iterator to traverse backwards + */ + +/* + * (C) Copyright David Abrahams 2002. + * (C) Copyright Jeremy Siek 2002. + * (C) Copyright Thomas Witt 2002. + * + * Distributed under the Boost Software License, Version 1.0. + * (See accompanying NOTICE file for the complete license) + * + * For more information, see http://www.boost.org + */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ + +/*! \addtogroup iterators + * \{ + */ + +/*! \addtogroup fancyiterator Fancy Iterators + * \ingroup iterators + * \{ + */ + +/*! \p reverse_iterator is an iterator which represents a pointer into a + * reversed view of a given range. In this way, \p reverse_iterator allows + * backwards iteration through a bidirectional input range. + * + * It is important to note that although \p reverse_iterator is constructed + * from a given iterator, it points to the element preceding it. In this way, + * the past-the-end \p reverse_iterator of a given range points to the element + * preceding the first element of the input range. By the same token, the first + * \p reverse_iterator of a given range is constructed from a past-the-end iterator + * of the original range yet points to the last element of the input. + * + * The following code snippet demonstrates how to create a \p reverse_iterator + * which represents a reversed view of the contents of a \p device_vector. + * + * \code + * #include + * #include + * ... + * thrust::device_vector v(4); + * v[0] = 0.0f; + * v[1] = 1.0f; + * v[2] = 2.0f; + * v[3] = 3.0f; + * + * typedef thrust::device_vector::iterator Iterator; + * + * // note that we point the iterator to the *end* of the device_vector + * thrust::reverse_iterator iter(values.end()); + * + * *iter; // returns 3.0f; + * iter[0]; // returns 3.0f; + * iter[1]; // returns 2.0f; + * iter[2]; // returns 1.0f; + * iter[3]; // returns 0.0f; + * + * // iter[4] is an out-of-bounds error + * \endcode + * + * Since reversing a range is a common operation, containers like \p device_vector + * have nested typedefs for declaration shorthand and methods for constructing + * reverse_iterators. The following code snippet is equivalent to the previous: + * + * \code + * #include + * ... 
+ * thrust::device_vector v(4); + * v[0] = 0.0f; + * v[1] = 1.0f; + * v[2] = 2.0f; + * v[3] = 3.0f; + * + * // we use the nested type reverse_iterator to refer to a reversed view of + * // a device_vector and the method rbegin() to create a reverse_iterator pointing + * // to the beginning of the reversed device_vector + * thrust::device_iterator::reverse_iterator iter = values.rbegin(); + * + * *iter; // returns 3.0f; + * iter[0]; // returns 3.0f; + * iter[1]; // returns 2.0f; + * iter[2]; // returns 1.0f; + * iter[3]; // returns 0.0f; + * + * // iter[4] is an out-of-bounds error + * + * // similarly, rend() points to the end of the reversed sequence: + * assert(values.rend() == (iter + 4)); + * \endcode + * + * Finally, the following code snippet demonstrates how to use reverse_iterator to + * perform a reversed prefix sum operation on the contents of a device_vector: + * + * \code + * #include + * #include + * ... + * thrust::device_vector v(5); + * v[0] = 0; + * v[1] = 1; + * v[2] = 2; + * v[3] = 3; + * v[4] = 4; + * + * thrust::device_vector result(5); + * + * // exclusive scan v into result in reverse + * thrust::exclusive_scan(v.rbegin(), v.rend(), result.begin()); + * + * // result is now {0, 4, 7, 9, 10} + * \endcode + * + * \see make_reverse_iterator + */ +template + class reverse_iterator + : public detail::reverse_iterator_base::type +{ + /*! \cond + */ + private: + typedef typename thrust::detail::reverse_iterator_base< + BidirectionalIterator + >::type super_t; + + friend class thrust::iterator_core_access; + /*! \endcond + */ + + public: + /*! Default constructor does nothing. + */ + __host__ __device__ + reverse_iterator(void) {} + + /*! \p Constructor accepts a \c BidirectionalIterator pointing to a range + * for this \p reverse_iterator to reverse. + * + * \param x A \c BidirectionalIterator pointing to a range to reverse. + */ + __host__ __device__ + explicit reverse_iterator(BidirectionalIterator x); + + /*! \p Copy constructor allows construction from a related compatible + * \p reverse_iterator. + * + * \param r A \p reverse_iterator to copy from. + */ + template + __host__ __device__ + reverse_iterator(reverse_iterator const &r +// XXX msvc screws this up +// XXX remove these guards when we have static_assert +#ifndef _MSC_VER + , typename thrust::detail::enable_if< + thrust::detail::is_convertible< + OtherBidirectionalIterator, + BidirectionalIterator + >::value + >::type * = 0 +#endif // _MSC_VER + ); + + /*! \cond + */ + private: + __thrust_hd_warning_disable__ + __host__ __device__ + typename super_t::reference dereference(void) const; + + __host__ __device__ + void increment(void); + + __host__ __device__ + void decrement(void); + + __host__ __device__ + void advance(typename super_t::difference_type n); + + template + __host__ __device__ + typename super_t::difference_type + distance_to(reverse_iterator const &y) const; + /*! \endcond + */ +}; // end reverse_iterator + + +/*! \p make_reverse_iterator creates a \p reverse_iterator + * from a \c BidirectionalIterator pointing to a range of elements to reverse. + * + * \param x A \c BidirectionalIterator pointing to a range to reverse. + * \return A new \p reverse_iterator which reverses the range \p x. + */ +template +__host__ __device__ +reverse_iterator make_reverse_iterator(BidirectionalIterator x); + + +/*! \} // end fancyiterators + */ + +/*! 
\} // end iterators + */ + +} // end thrust + +#include + diff --git a/compat/thrust/iterator/transform_iterator.h b/compat/thrust/iterator/transform_iterator.h new file mode 100644 index 0000000..985b61b --- /dev/null +++ b/compat/thrust/iterator/transform_iterator.h @@ -0,0 +1,344 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file thrust/iterator/transform_iterator.h + * \brief An iterator which adapts another iterator by applying a function to the result of its dereference + */ + +/* + * (C) Copyright David Abrahams 2002. + * (C) Copyright Jeremy Siek 2002. + * (C) Copyright Thomas Witt 2002. + * + * Distributed under the Boost Software License, Version 1.0. + * (See accompanying NOTICE file for the complete license) + * + * For more information, see http://www.boost.org + */ + +#pragma once + +#include + +// #include the details first +#include +#include +#include +#include + +namespace thrust +{ + +/*! \addtogroup iterators + * \{ + */ + +/*! \addtogroup fancyiterator Fancy Iterators + * \ingroup iterators + * \{ + */ + +/*! \p transform_iterator is an iterator which represents a pointer into a range + * of values after transformation by a function. This iterator is useful for + * creating a range filled with the result of applying an operation to another range + * without either explicitly storing it in memory, or explicitly executing the transformation. + * Using \p transform_iterator facilitates kernel fusion by deferring the execution + * of a transformation until the value is needed while saving both memory capacity + * and bandwidth. + * + * The following code snippet demonstrates how to create a \p transform_iterator + * which represents the result of \c sqrtf applied to the contents of a \p device_vector. + * + * \code + * #include + * #include + * + * // note: functor inherits from unary_function + * struct square_root : public thrust::unary_function + * { + * __host__ __device__ + * float operator()(float x) const + * { + * return sqrtf(x); + * } + * }; + * + * int main(void) + * { + * thrust::device_vector v(4); + * v[0] = 1.0f; + * v[1] = 4.0f; + * v[2] = 9.0f; + * v[3] = 16.0f; + * + * typedef thrust::device_vector::iterator FloatIterator; + * + * thrust::transform_iterator iter(v.begin(), square_root()); + * + * *iter; // returns 1.0f + * iter[0]; // returns 1.0f; + * iter[1]; // returns 2.0f; + * iter[2]; // returns 3.0f; + * iter[3]; // returns 4.0f; + * + * // iter[4] is an out-of-bounds error + * } + * \endcode + * + * This next example demonstrates how to use a \p transform_iterator with the + * \p thrust::reduce function to compute the sum of squares of a sequence. 
+ * We will create temporary \p transform_iterators with the + * \p make_transform_iterator function in order to avoid explicitly specifying their type: + * + * \code + * #include + * #include + * #include + * #include + * + * // note: functor inherits from unary_function + * struct square : public thrust::unary_function + * { + * __host__ __device__ + * float operator()(float x) const + * { + * return x * x; + * } + * }; + * + * int main(void) + * { + * // initialize a device array + * thrust::device_vector v(4); + * v[0] = 1.0f; + * v[1] = 2.0f; + * v[2] = 3.0f; + * v[3] = 4.0f; + * + * float sum_of_squares = + * thrust::reduce(thrust::make_transform_iterator(v.begin(), square()), + * thrust::make_transform_iterator(v.end(), square())); + * + * std::cout << "sum of squares: " << sum_of_squares << std::endl; + * return 0; + * } + * \endcode + * + * Note that in the previous two examples the transform functor (namely \c square_root + * and \c square) inherits from \c thrust::unary_function. Inheriting from + * \c thrust::unary_function ensures that a functor is a valid \c AdaptableUnaryFunction + * and provides all the necessary \c typedef declarations. The \p transform_iterator + * can also be applied to a \c UnaryFunction that does not inherit from + * \c thrust::unary_function using an optional template argument. The following example + * illustrates how to use the third template argument to specify the \c result_type of + * the function. + * + * \code + * #include + * #include + * + * // note: functor *does not* inherit from unary_function + * struct square_root + * { + * __host__ __device__ + * float operator()(float x) const + * { + * return sqrtf(x); + * } + * }; + * + * int main(void) + * { + * thrust::device_vector v(4); + * v[0] = 1.0f; + * v[1] = 4.0f; + * v[2] = 9.0f; + * v[3] = 16.0f; + * + * typedef thrust::device_vector::iterator FloatIterator; + * + * // note: float result_type is specified explicitly + * thrust::transform_iterator iter(v.begin(), square_root()); + * + * *iter; // returns 1.0f + * iter[0]; // returns 1.0f; + * iter[1]; // returns 2.0f; + * iter[2]; // returns 3.0f; + * iter[3]; // returns 4.0f; + * + * // iter[4] is an out-of-bounds error + * } + * \endcode + * + * \see make_transform_iterator + */ +template + class transform_iterator + : public detail::transform_iterator_base::type +{ + /*! \cond + */ + public: + typedef typename + detail::transform_iterator_base::type + super_t; + + friend class thrust::iterator_core_access; + /*! \endcond + */ + + public: + /*! Null constructor does nothing. + */ + __host__ __device__ + transform_iterator() {} + + /*! This constructor takes as arguments an \c Iterator and an \c AdaptableUnaryFunction + * and copies them to a new \p transform_iterator. + * + * \param x An \c Iterator pointing to the input to this \p transform_iterator's \c AdaptableUnaryFunction. + * \param f An \c AdaptableUnaryFunction used to transform the objects pointed to by \p x. + */ + __host__ __device__ + transform_iterator(Iterator const& x, AdaptableUnaryFunction f) + : super_t(x), m_f(f) { + } + + /*! This explicit constructor copies the value of a given \c Iterator and creates + * this \p transform_iterator's \c AdaptableUnaryFunction using its null constructor. + * + * \param x An \c Iterator to copy. + */ + __host__ __device__ + explicit transform_iterator(Iterator const& x) + : super_t(x) { } + + /*! This copy constructor creates a new \p transform_iterator from another + * \p transform_iterator. 
+ * + * \param other The \p transform_iterator to copy. + */ + template + __host__ __device__ + transform_iterator(const transform_iterator &other, + typename thrust::detail::enable_if_convertible::type* = 0, + typename thrust::detail::enable_if_convertible::type* = 0) + : super_t(other.base()), m_f(other.functor()) {} + + /*! Copy assignment operator copies from another \p transform_iterator. + * \p other The other \p transform_iterator to copy + * \return *this + * + * \note If the type of this \p transform_iterator's functor is not copy assignable + * (for example, if it is a lambda) it is not an error to call this function. + * In this case, however, the functor will not be modified. + * + * In any case, this \p transform_iterator's underlying iterator will be copy assigned. + */ + __host__ __device__ + transform_iterator &operator=(const transform_iterator &other) + { + return do_assign(other, + // XXX gcc 4.2.1 crashes on is_copy_assignable; just assume the functor is assignable as a WAR +#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC) && (THRUST_GCC_VERSION <= 40201) + thrust::detail::true_type() +#else + typename thrust::detail::is_copy_assignable::type() +#endif // THRUST_HOST_COMPILER + ); + } + + /*! This method returns a copy of this \p transform_iterator's \c AdaptableUnaryFunction. + * \return A copy of this \p transform_iterator's \c AdaptableUnaryFunction. + */ + __host__ __device__ + AdaptableUnaryFunction functor() const + { return m_f; } + + /*! \cond + */ + private: + __host__ __device__ + transform_iterator &do_assign(const transform_iterator &other, thrust::detail::true_type) + { + super_t::operator=(other); + + // do assign to m_f + m_f = other.functor(); + + return *this; + } + + __host__ __device__ + transform_iterator &do_assign(const transform_iterator &other, thrust::detail::false_type) + { + super_t::operator=(other); + + // don't assign to m_f + + return *this; + } + + __thrust_hd_warning_disable__ + __host__ __device__ + typename super_t::reference dereference() const + { + // XXX consider making this a member instead of a temporary created inside dereference + thrust::detail::host_device_function wrapped_f(m_f); + + return wrapped_f(*this->base()); + } + + // tag this as mutable per Dave Abrahams in this thread: + // http://lists.boost.org/Archives/boost/2004/05/65332.php + mutable AdaptableUnaryFunction m_f; + + /*! \endcond + */ +}; // end transform_iterator + + +/*! \p make_transform_iterator creates a \p transform_iterator + * from an \c Iterator and \c AdaptableUnaryFunction. + * + * \param it The \c Iterator pointing to the input range of the + * newly created \p transform_iterator. + * \param fun The \c AdaptableUnaryFunction used to transform the range pointed + * to by \p it in the newly created \p transform_iterator. + * \return A new \p transform_iterator which transforms the range at + * \p it by \p fun. + * \see transform_iterator + */ +template +inline __host__ __device__ +transform_iterator +make_transform_iterator(Iterator it, AdaptableUnaryFunction fun) +{ + return transform_iterator(it, fun); +} // end make_transform_iterator + +/*! \} // end fancyiterators + */ + +/*! 
\} // end iterators + */ + +} // end thrust + diff --git a/compat/thrust/iterator/zip_iterator.h b/compat/thrust/iterator/zip_iterator.h new file mode 100644 index 0000000..8e7299c --- /dev/null +++ b/compat/thrust/iterator/zip_iterator.h @@ -0,0 +1,245 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file thrust/iterator/zip_iterator.h + * \brief An iterator which returns a tuple of the result of dereferencing + * a tuple of iterators when dereferenced + */ + +/* + * Copyright David Abrahams and Thomas Becker 2000-2006. + * + * Distributed under the Boost Software License, Version 1.0. + * (See accompanying NOTICE file for the complete license) + * + * For more information, see http://www.boost.org + */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ + +/*! \addtogroup iterators + * \{ + */ + +/*! \addtogroup fancyiterator Fancy Iterators + * \ingroup iterators + * \{ + */ + +/*! \p zip_iterator is an iterator which represents a pointer into a range + * of \p tuples whose elements are themselves taken from a \p tuple of input + * iterators. This iterator is useful for creating a virtual array of structures + * while achieving the same performance and bandwidth as the structure of arrays + * idiom. \p zip_iterator also facilitates kernel fusion by providing a convenient + * means of amortizing the execution of the same operation over multiple ranges. + * + * The following code snippet demonstrates how to create a \p zip_iterator + * which represents the result of "zipping" multiple ranges together. + * + * \code + * #include + * #include + * #include + * ... + * thrust::device_vector int_v(3); + * int_v[0] = 0; int_v[1] = 1; int_v[2] = 2; + * + * thrust::device_vector float_v(3); + * float_v[0] = 0.0f; float_v[1] = 1.0;f float_v[2] = 2.0f; + * + * thrust::device_vector char_v(3); + * char_v[0] = 'a'; char_v[1] = 'b'; char_v[2] = 'c'; + * + * // typedef these iterators for shorthand + * typedef thrust::device_vector::iterator IntIterator; + * typedef thrust::device_vector::iterator FloatIterator; + * typedef thrust::device_vector::iterator CharIterator; + * + * // typedef a tuple of these iterators + * typedef thrust::tuple IteratorTuple; + * + * // typedef the zip_iterator of this tuple + * typedef thrust::zip_iterator ZipIterator; + * + * // finally, create the zip_iterator + * ZipIterator iter(thrust::make_tuple(int_v.begin(), float_v.begin(), char_v.begin())); + * + * *iter; // returns (0, 0.0f, 'a') + * iter[0]; // returns (0, 0.0f, 'a') + * iter[1]; // returns (1, 1.0f, 'b') + * iter[2]; // returns (2, 2.0f, 'c') + * + * thrust::get<0>(iter[2]); // returns 2 + * thrust::get<1>(iter[0]); // returns 0.0f + * thrust::get<2>(iter[1]); // returns 'b' + * + * // iter[3] is an out-of-bounds error + * \endcode + * + * Defining the type of a \p zip_iterator can be complex. 
The next code example demonstrates + * how to use the \p make_zip_iterator function with the \p make_tuple function to avoid + * explicitly specifying the type of the \p zip_iterator. This example shows how to use + * \p zip_iterator to copy multiple ranges with a single call to \p thrust::copy. + * + * \code + * #include + * #include + * #include + * + * int main(void) + * { + * thrust::device_vector int_in(3), int_out(3); + * int_in[0] = 0; + * int_in[1] = 1; + * int_in[2] = 2; + * + * thrust::device_vector float_in(3), float_out(3); + * float_in[0] = 0.0f; + * float_in[1] = 10.0f; + * float_in[2] = 20.0f; + * + * thrust::copy(thrust::make_zip_iterator(thrust::make_tuple(int_in.begin(), float_in.begin())), + * thrust::make_zip_iterator(thrust::make_tuple(int_in.end(), float_in.end())), + * thrust::make_zip_iterator(thrust::make_tuple(int_out.begin(),float_out.begin()))); + * + * // int_out is now [0, 1, 2] + * // float_out is now [0.0f, 10.0f, 20.0f] + * + * return 0; + * } + * \endcode + * + * \see make_zip_iterator + * \see make_tuple + * \see tuple + * \see get + */ +template + class zip_iterator + : public detail::zip_iterator_base::type +{ + public: + /*! Null constructor does nothing. + */ + inline __host__ __device__ + zip_iterator(void); + + /*! This constructor creates a new \p zip_iterator from a + * \p tuple of iterators. + * + * \param iterator_tuple The \p tuple of iterators to copy from. + */ + inline __host__ __device__ + zip_iterator(IteratorTuple iterator_tuple); + + /*! This copy constructor creates a new \p zip_iterator from another + * \p zip_iterator. + * + * \param other The \p zip_iterator to copy. + */ + template + inline __host__ __device__ + zip_iterator(const zip_iterator &other, + typename thrust::detail::enable_if_convertible< + OtherIteratorTuple, + IteratorTuple + >::type * = 0); + + /*! This method returns a \c const reference to this \p zip_iterator's + * \p tuple of iterators. + * + * \return A \c const reference to this \p zip_iterator's \p tuple + * of iterators. + */ + inline __host__ __device__ + const IteratorTuple &get_iterator_tuple() const; + + /*! \cond + */ + private: + typedef typename + detail::zip_iterator_base::type super_t; + + friend class thrust::iterator_core_access; + + // Dereferencing returns a tuple built from the dereferenced + // iterators in the iterator tuple. + __host__ __device__ + typename super_t::reference dereference() const; + + // Two zip_iterators are equal if the two first iterators of the + // tuple are equal. Note this differs from Boost's implementation, which + // considers the entire tuple. + template + inline __host__ __device__ + bool equal(const zip_iterator &other) const; + + // Advancing a zip_iterator means to advance all iterators in the tuple + inline __host__ __device__ + void advance(typename super_t::difference_type n); + + // Incrementing a zip iterator means to increment all iterators in the tuple + inline __host__ __device__ + void increment(); + + // Decrementing a zip iterator means to decrement all iterators in the tuple + inline __host__ __device__ + void decrement(); + + // Distance is calculated using the first iterator in the tuple. + template + inline __host__ __device__ + typename super_t::difference_type + distance_to(const zip_iterator &other) const; + + // The iterator tuple. + IteratorTuple m_iterator_tuple; + + /*! \endcond + */ +}; // end zip_iterator + +/*! \p make_zip_iterator creates a \p zip_iterator from a \p tuple + * of iterators. 
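+ *
+ * A brief illustrative sketch (the vector names are examples only; headers as in
+ * the examples above):
+ *
+ * \code
+ * thrust::device_vector<int>   a(3, 1);
+ * thrust::device_vector<float> b(3, 2.0f);
+ *
+ * // dereferencing the zipped begin yields the tuple (1, 2.0f)
+ * *thrust::make_zip_iterator(thrust::make_tuple(a.begin(), b.begin()));
+ * \endcode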
+ * + * \param t The \p tuple of iterators to copy. + * \return A newly created \p zip_iterator which zips the iterators encapsulated in \p t. + * + * \see zip_iterator + */ +template +inline __host__ __device__ +zip_iterator make_zip_iterator(IteratorTuple t); + +/*! \} // end fancyiterators + */ + +/*! \} // end iterators + */ + +} // end thrust + +#include + diff --git a/compat/thrust/logical.h b/compat/thrust/logical.h new file mode 100644 index 0000000..21510f3 --- /dev/null +++ b/compat/thrust/logical.h @@ -0,0 +1,276 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file logical.h + * \brief Logical operations on ranges + */ + +#pragma once + +#include +#include + +namespace thrust +{ + + +/*! \addtogroup reductions + * \{ + * \addtogroup logical + * \ingroup reductions + * \{ + */ + + +/*! \p all_of determines whether all elements in a range satify a predicate. + * Specifically, \p all_of returns \c true if pred(*i) is \c true + * for every iterator \c i in the range [first, last) and + * \c false otherwise. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the sequence. + * \param last The end of the sequence. + * \param pred A predicate used to test range elements. + * \return \c true, if all elements satisfy the predicate; \c false, otherwise. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator is a model of Input Iterator, + * \tparam Predicate must be a model of Predicate. + * + * \code + * #include + * #include + * #include + * ... + * bool A[3] = {true, true, false}; + * + * thrust::all_of(thrust::host, A, A + 2, thrust::identity()); // returns true + * thrust::all_of(thrust::host, A, A + 3, thrust::identity()); // returns false + * + * // empty range + * thrust::all_of(thrust::host, A, A, thrust::identity()); // returns false + * + * \endcode + * + * \see any_of + * \see none_of + * \see transform_reduce + */ +template +bool all_of(const thrust::detail::execution_policy_base &exec, InputIterator first, InputIterator last, Predicate pred); + + +/*! \p all_of determines whether all elements in a range satify a predicate. + * Specifically, \p all_of returns \c true if pred(*i) is \c true + * for every iterator \c i in the range [first, last) and + * \c false otherwise. + * + * \param first The beginning of the sequence. + * \param last The end of the sequence. + * \param pred A predicate used to test range elements. + * \return \c true, if all elements satisfy the predicate; \c false, otherwise. + * + * \tparam InputIterator is a model of Input Iterator, + * \tparam Predicate must be a model of Predicate. + * + * \code + * #include + * #include + * ... 
+ * bool A[3] = {true, true, false}; + * + * thrust::all_of(A, A + 2, thrust::identity()); // returns true + * thrust::all_of(A, A + 3, thrust::identity()); // returns false + * + * // empty range + * thrust::all_of(A, A, thrust::identity()); // returns false + * + * \endcode + * + * \see any_of + * \see none_of + * \see transform_reduce + */ +template +bool all_of(InputIterator first, InputIterator last, Predicate pred); + + +/*! \p any_of determines whether any element in a range satifies a predicate. + * Specifically, \p any_of returns \c true if pred(*i) is \c true + * for any iterator \c i in the range [first, last) and + * \c false otherwise. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the sequence. + * \param last The end of the sequence. + * \param pred A predicate used to test range elements. + * \return \c true, if any element satisfies the predicate; \c false, otherwise. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator is a model of Input Iterator, + * \tparam Predicate must be a model of Predicate. + * + * \code + * #include + * #include + * #include + * ... + * bool A[3] = {true, true, false}; + * + * thrust::any_of(thrust::host, A, A + 2, thrust::identity()); // returns true + * thrust::any_of(thrust::host, A, A + 3, thrust::identity()); // returns true + * + * thrust::any_of(thrust::host, A + 2, A + 3, thrust::identity()); // returns false + * + * // empty range + * thrust::any_of(thrust::host, A, A, thrust::identity()); // returns false + * \endcode + * + * \see all_of + * \see none_of + * \see transform_reduce + */ +template +bool any_of(const thrust::detail::execution_policy_base &exec, InputIterator first, InputIterator last, Predicate pred); + + +/*! \p any_of determines whether any element in a range satifies a predicate. + * Specifically, \p any_of returns \c true if pred(*i) is \c true + * for any iterator \c i in the range [first, last) and + * \c false otherwise. + * + * \param first The beginning of the sequence. + * \param last The end of the sequence. + * \param pred A predicate used to test range elements. + * \return \c true, if any element satisfies the predicate; \c false, otherwise. + * + * \tparam InputIterator is a model of Input Iterator, + * \tparam Predicate must be a model of Predicate. + * + * \code + * #include + * #include + * ... + * bool A[3] = {true, true, false}; + * + * thrust::any_of(A, A + 2, thrust::identity()); // returns true + * thrust::any_of(A, A + 3, thrust::identity()); // returns true + * + * thrust::any_of(A + 2, A + 3, thrust::identity()); // returns false + * + * // empty range + * thrust::any_of(A, A, thrust::identity()); // returns false + * \endcode + * + * \see all_of + * \see none_of + * \see transform_reduce + */ +template +bool any_of(InputIterator first, InputIterator last, Predicate pred); + + +/*! \p none_of determines whether no element in a range satifies a predicate. + * Specifically, \p none_of returns \c true if there is no iterator \c i in + * the range [first, last) such that pred(*i) is \c true, + * and \c false otherwise. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the sequence. + * \param last The end of the sequence. + * \param pred A predicate used to test range elements. 
+ * \return \c true, if no element satisfies the predicate; \c false, otherwise. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator is a model of Input Iterator, + * \tparam Predicate must be a model of Predicate. + * + * \code + * #include + * #include + * #include + * ... + * bool A[3] = {true, true, false}; + * + * thrust::none_of(thrust::host, A, A + 2, thrust::identity()); // returns false + * thrust::none_of(thrust::host, A, A + 3, thrust::identity()); // returns false + * + * thrust::none_of(thrust::host, A + 2, A + 3, thrust::identity()); // returns true + * + * // empty range + * thrust::none_of(thrust::host, A, A, thrust::identity()); // returns true + * \endcode + * + * \see all_of + * \see any_of + * \see transform_reduce + */ +template +bool none_of(const thrust::detail::execution_policy_base &exec, InputIterator first, InputIterator last, Predicate pred); + + +/*! \p none_of determines whether no element in a range satifies a predicate. + * Specifically, \p none_of returns \c true if there is no iterator \c i in + * the range [first, last) such that pred(*i) is \c true, + * and \c false otherwise. + * + * \param first The beginning of the sequence. + * \param last The end of the sequence. + * \param pred A predicate used to test range elements. + * \return \c true, if no element satisfies the predicate; \c false, otherwise. + * + * \tparam InputIterator is a model of Input Iterator, + * \tparam Predicate must be a model of Predicate. + * + * \code + * #include + * #include + * ... + * bool A[3] = {true, true, false}; + * + * thrust::none_of(A, A + 2, thrust::identity()); // returns false + * thrust::none_of(A, A + 3, thrust::identity()); // returns false + * + * thrust::none_of(A + 2, A + 3, thrust::identity()); // returns true + * + * // empty range + * thrust::none_of(A, A, thrust::identity()); // returns true + * \endcode + * + * \see all_of + * \see any_of + * \see transform_reduce + */ +template +bool none_of(InputIterator first, InputIterator last, Predicate pred); + + +/*! \} // end logical + * \} // end reductions + */ + + +} // end namespace thrust + +#include + diff --git a/compat/thrust/memory.h b/compat/thrust/memory.h new file mode 100644 index 0000000..6362de4 --- /dev/null +++ b/compat/thrust/memory.h @@ -0,0 +1,538 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file thrust/memory.h + * \brief Abstractions for Thrust's memory model. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ + + +/*! \addtogroup memory_management Memory Management + * \addtogroup memory_management_classes Memory Management Classes + * \ingroup memory_management + * \{ + */ + +/*! \p pointer stores a pointer to an object allocated in memory. Like \p device_ptr, this + * type ensures type safety when dispatching standard algorithms on ranges resident in memory. 
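+ *
+ * A brief illustrative sketch (the variable names are examples only; the allocation
+ * pattern mirrors the \p thrust::malloc examples later in this header):
+ *
+ * \code
+ * #include <thrust/memory.h>
+ * ...
+ * thrust::device_system_tag device_sys;
+ *
+ * // an untyped allocation; the returned pointer carries the device system's tag
+ * thrust::pointer<void, thrust::device_system_tag> p = thrust::malloc(device_sys, 100);
+ *
+ * // the raw pointer can be recovered when interfacing with untagged APIs
+ * void *raw = thrust::raw_pointer_cast(p);
+ *
+ * thrust::free(device_sys, p);
+ * \endcode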
+ * + * \p pointer generalizes \p device_ptr by relaxing the backend system associated with the \p pointer. + * Instead of the backend system specified by \p THRUST_DEFAULT_DEVICE_BACKEND, \p pointer's + * system is given by its second template parameter, \p Tag. For the purpose of Thrust dispatch, + * device_ptr and pointer are considered equivalent. + * + * The raw pointer encapsulated by a \p pointer may be obtained through its get member function + * or the \p raw_pointer_cast free function. + * + * \tparam Element specifies the type of the pointed-to object. + * + * \tparam Tag specifies the system with which this \p pointer is associated. This may be any Thrust + * backend system, or a user-defined tag. + * + * \tparam Reference allows the client to specify the reference type returned upon derereference. + * By default, this type is reference. + * + * \tparam Derived allows the client to specify the name of the derived type when \p pointer is used as + * a base class. This is useful to ensure that arithmetic on values of the derived type return + * values of the derived type as a result. By default, this type is pointer. + * + * \note \p pointer is not a smart pointer; it is the client's responsibility to deallocate memory + * pointer to by \p pointer. + * + * \see device_ptr + * \see reference + * \see raw_pointer_cast + */ +// define pointer for the purpose of Doxygenating it +// it is actually defined elsewhere +#if 0 +template + class pointer +{ + public: + /*! The type of the raw pointer + */ + typedef typename super_t::base_type raw_pointer; + + /*! \p pointer's default constructor initializes its encapsulated pointer to \c 0 + */ + __host__ __device__ + pointer(); + + /*! This constructor allows construction of a pointer from a T*. + * + * \param ptr A raw pointer to copy from, presumed to point to a location in \p Tag's memory. + * \tparam OtherElement \p OtherElement shall be convertible to \p Element. + */ + template + __host__ __device__ + explicit pointer(OtherElement *ptr); + + /*! This contructor allows initialization from another pointer-like object. + * + * \param other The \p OtherPointer to copy. + * + * \tparam OtherPointer The tag associated with \p OtherPointer shall be convertible to \p Tag, + * and its element type shall be convertible to \p Element. + */ + template + __host__ __device__ + pointer(const OtherPointer &other, + typename thrust::detail::enable_if_pointer_is_convertible< + OtherPointer, + pointer + >::type * = 0); + + /*! Assignment operator allows assigning from another pointer-like object with related type. + * + * \param other The other pointer-like object to assign from. + * \return *this + * + * \tparam OtherPointer The tag associated with \p OtherPointer shall be convertible to \p Tag, + * and its element type shall be convertible to \p Element. + */ + template + __host__ __device__ + typename thrust::detail::enable_if_pointer_is_convertible< + OtherPointer, + pointer, + derived_type & + >::type + operator=(const OtherPointer &other); + + /*! \p get returns this \p pointer's encapsulated raw pointer. + * \return This \p pointer's raw pointer. + */ + __host__ __device__ + Element *get() const; +}; +#endif + +/*! \p reference is a wrapped reference to an object stored in memory. \p reference generalizes + * \p device_reference by relaxing the type of pointer associated with the object. 
\p reference + * is the type of the result of dereferencing a tagged pointer-like object such as \p pointer, and + * intermediates operations on objects existing in a remote memory. + * + * \tparam Element specifies the type of the referent object. + * \tparam Pointer specifies the type of the result of taking the address of \p reference. + * \tparam Derived allows the client to specify the name of the derived type when \p reference is used as + * a base class. This is useful to ensure that assignment to objects of the derived type return + * values of the derived type as a result. By default, this type is reference. + */ +// define pointer for the purpose of Doxygenating it +// it is actually defined elsewhere +#if 0 +template + class reference +{ + public: + /*! The type of this \p reference's wrapped pointers. + */ + typedef Pointer pointer; + + /*! The \p value_type of this \p reference. + */ + typedef typename thrust::detail::remove_const::type value_type; + + /*! This copy constructor initializes this \p reference + * to refer to an object pointed to by the given \p pointer. After + * this \p reference is constructed, it shall refer to the + * object pointed to by \p ptr. + * + * \param ptr A \p pointer to copy from. + */ + __host__ __device__ + explicit reference(const pointer &ptr); + + /*! This copy constructor accepts a const reference to another + * \p reference of related type. After this \p reference is constructed, + * it shall refer to the same object as \p other. + * + * \param other A \p reference to copy from. + * \tparam OtherElement the element type of the other \p reference. + * \tparam OtherPointer the pointer type of the other \p reference. + * \tparam OtherDerived the derived type of the other \p reference. + * + * \note This constructor is templated primarily to allow initialization of + * reference from reference. + */ + template + __host__ __device__ + reference(const reference &other, + typename thrust::detail::enable_if_convertible< + typename reference::pointer, + pointer + >::type * = 0); + + /*! Copy assignment operator copy assigns from another \p reference. + * + * \param other The other \p reference to assign from. + * \return static_cast(*this) + */ + __host__ __device__ + derived_type &operator=(const reference &other); + + /*! Assignment operator copy assigns from another \p reference of related type. + * + * \param other The other \p reference to assign from. + * \return static_cast(*this) + * + * \tparam OtherElement the element type of the other \p reference. + * \tparam OtherPointer the pointer type of the other \p reference. + * \tparam OtherDerived the derived type of the other \p reference. + */ + template + __host__ __device__ + derived_type &operator=(const reference &other); + + /*! Assignment operator assigns from a \p value_type. + * + * \param x The \p value_type to assign from. + * \return static_cast(*this). + */ + __host__ __device__ + derived_type &operator=(const value_type &x); + + /*! Address-of operator returns a \p pointer pointing to the object + * referenced by this \p reference. It does not return the address of this + * \p reference. + * + * \return A \p pointer pointing to the referenct object. + */ + __host__ __device__ + pointer operator&() const; + + /*! Conversion operator converts this \p reference to \p value_type by + * returning a copy of the referent object. + * + * \return A copy of the referent object. + */ + __host__ __device__ + operator value_type () const; + + /*! 
Swaps the value of the referent object with another. + * + * \param other The other \p reference with which to swap. + * \note The argument is of type \p derived_type rather than \p reference. + */ + __host__ __device__ + void swap(derived_type &other); + + /*! Prefix increment operator increments the referent object. + * + * \return static_Cast(*this). + * + * \note Documentation for other arithmetic operators omitted for brevity. + */ + derived_type &operator++(); +}; +#endif + +/*! \} + */ + +/*! + * \addtogroup memory_management_functions Memory Management Functions + * \ingroup memory_management + * \{ + */ + + +/*! \addtogroup allocation_functions + * \{ + */ + + +/*! This version of \p malloc allocates untyped uninitialized storage associated with a given system. + * + * \param system The Thrust system with which to associate the storage. + * \param n The number of bytes of storage to allocate. + * \return If allocation succeeds, a pointer to the allocated storage; a null pointer otherwise. + * The pointer must be deallocated with \p thrust::free. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * + * \pre \p DerivedPolicy must be publically derived from thrust::execution_policy. + * + * The following code snippet demonstrates how to use \p malloc to allocate a range of memory + * associated with Thrust's device system. + * + * \code + * #include + * ... + * // allocate some memory with thrust::malloc + * const int N = 100; + * thrust::device_system_tag device_sys; + * thrust::pointer void_ptr = thrust::malloc(device_sys, N); + * + * // manipulate memory + * ... + * + * // deallocate void_ptr with thrust::free + * thrust::free(device_sys, void_ptr); + * \endcode + * + * \see free + * \see device_malloc + */ +template +pointer malloc(const thrust::detail::execution_policy_base &system, std::size_t n); + + +/*! This version of \p malloc allocates typed uninitialized storage associated with a given system. + * + * \param system The Thrust system with which to associate the storage. + * \param n The number of elements of type \c T which the storage should accomodate. + * \return If allocation succeeds, a pointer to an allocation large enough to accomodate \c n + * elements of type \c T; a null pointer otherwise. + * The pointer must be deallocated with \p thrust::free. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * + * \pre \p DerivedPolicy must be publically derived from thrust::execution_policy. + * + * The following code snippet demonstrates how to use \p malloc to allocate a range of memory + * to accomodate integers associated with Thrust's device system. + * + * \code + * #include + * ... + * // allocate storage for 100 ints with thrust::malloc + * const int N = 100; + * thrust::device_system_tag device_sys; + * thrust::pointer ptr = thrust::malloc(device_sys, N); + * + * // manipulate memory + * ... + * + * // deallocate ptr with thrust::free + * thrust::free(device_sys, ptr); + * \endcode + * + * \see free + * \see device_malloc + */ +template +pointer malloc(const thrust::detail::execution_policy_base &system, std::size_t n); + + +/*! \p get_temporary_buffer returns a pointer to storage associated with a given Thrust system sufficient to store up to + * \p n objects of type \c T. If not enough storage is available to accomodate \p n objects, an implementation may return + * a smaller buffer. The number of objects the returned buffer can accomodate is also returned. 
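+ *
+ * A minimal sketch, assuming Thrust's device system, of checking the returned
+ * count before using the buffer (the names below are illustrative only):
+ *
+ * \code
+ * #include <thrust/memory.h>
+ * ...
+ * thrust::device_system_tag device_sys;
+ * const std::ptrdiff_t requested = 100;
+ *
+ * thrust::pair<
+ *   thrust::pointer<int, thrust::device_system_tag>,
+ *   std::ptrdiff_t
+ * > buf = thrust::get_temporary_buffer<int>(device_sys, requested);
+ *
+ * // buf.second may be smaller than requested, so it is the loop bound
+ * for(std::ptrdiff_t i = 0; i < buf.second; ++i)
+ * {
+ *   *(buf.first + i) = 0;
+ * }
+ *
+ * thrust::return_temporary_buffer(device_sys, buf.first);
+ * \endcode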
+ * + * Thrust uses \p get_temporary_buffer internally when allocating temporary storage required by algorithm implementations. + * + * The storage allocated with \p get_temporary_buffer must be returned to the system with \p return_temporary_buffer. + * + * \param system The Thrust system with which to associate the storage. + * \param n The requested number of objects of type \c T the storage should accomodate. + * \return A pair \c p such that p.first is a pointer to the allocated storage and p.second is the number of + * contiguous objects of type \c T that the storage can accomodate. If no storage can be allocated, p.first if + * no storage can be obtained. The storage must be returned to the system using \p return_temporary_buffer. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * + * \pre \p DerivedPolicy must be publically derived from thrust::execution_policy. + * + * The following code snippet demonstrates how to use \p get_temporary_buffer to allocate a range of memory + * to accomodate integers associated with Thrust's device system. + * + * \code + * #include + * ... + * // allocate storage for 100 ints with thrust::get_temporary_buffer + * const int N = 100; + * + * typedef thrust::pair< + * thrust::pointer, + * std::ptrdiff_t + * > ptr_and_size_t; + * + * thrust::device_system_tag device_sys; + * ptr_and_size_t ptr_and_size = thrust::get_temporary_buffer(device_sys, N); + * + * // manipulate up to 100 ints + * for(int i = 0; i < ptr_and_size.second; ++i) + * { + * *ptr_and_size.first = i; + * } + * + * // deallocate storage with thrust::return_temporary_buffer + * thrust::return_temporary_buffer(device_sys, ptr_and_size.first); + * \endcode + * + * \see malloc + * \see return_temporary_buffer + */ +template +thrust::pair, typename thrust::pointer::difference_type> +get_temporary_buffer(const thrust::detail::execution_policy_base &system, typename thrust::pointer::difference_type n); + + +/*! \} allocation_functions + */ + + +/*! \addtogroup deallocation_functions + * \{ + */ + + +/*! \p free deallocates the storage previously allocated by \p thrust::malloc. + * + * \param system The Thrust system with which the storage is associated. + * \param ptr A pointer previously returned by \p thrust::malloc. If \p ptr is null, \p free + * does nothing. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * + * \pre \p ptr shall have been returned by a previous call to thrust::malloc(system, n) or thrust::malloc(system, n) for some type \c T. + * + * The following code snippet demonstrates how to use \p free to deallocate a range of memory + * previously allocated with \p thrust::malloc. + * + * \code + * #include + * ... + * // allocate storage for 100 ints with thrust::malloc + * const int N = 100; + * thrust::device_system_tag device_sys; + * thrust::pointer ptr = thrust::malloc(device_sys, N); + * + * // mainpulate memory + * ... + * + * // deallocate ptr with thrust::free + * thrust::free(device_sys, ptr); + * \endcode + */ +template +void free(const thrust::detail::execution_policy_base &system, Pointer ptr); + + +/*! \p return_temporary_buffer deallocates storage associated with a given Thrust system previously allocated by \p get_temporary_buffer. + * + * Thrust uses \p return_temporary_buffer internally when deallocating temporary storage required by algorithm implementations. + * + * \param system The Thrust system with which the storage is associated. 
+ * \param p A pointer previously returned by \p thrust::get_temporary_buffer. If \p ptr is null, \p return_temporary_buffer does nothing. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * + * \pre \p p shall have been previously allocated by \p thrust::get_temporary_buffer. + * + * The following code snippet demonstrates how to use \p return_temporary_buffer to deallocate a range of memory + * previously allocated by \p get_temporary_buffer. + * + * \code + * #include + * ... + * // allocate storage for 100 ints with thrust::get_temporary_buffer + * const int N = 100; + * + * typedef thrust::pair< + * thrust::pointer, + * std::ptrdiff_t + * > ptr_and_size_t; + * + * thrust::device_system_tag device_sys; + * ptr_and_size_t ptr_and_size = thrust::get_temporary_buffer(device_sys, N); + * + * // manipulate up to 100 ints + * for(int i = 0; i < ptr_and_size.second; ++i) + * { + * *ptr_and_size.first = i; + * } + * + * // deallocate storage with thrust::return_temporary_buffer + * thrust::return_temporary_buffer(device_sys, ptr_and_size.first); + * \endcode + * + * \see free + * \see get_temporary_buffer + */ +template +void return_temporary_buffer(const thrust::detail::execution_policy_base &system, Pointer p); + + +/*! \} deallocation_functions + */ + + +/*! \p raw_pointer_cast creates a "raw" pointer from a pointer-like type, + * simply returning the wrapped pointer, should it exist. + * + * \param ptr The pointer of interest. + * \return ptr.get(), if the expression is well formed; ptr, otherwise. + * \see raw_reference_cast + */ +template +__host__ __device__ +inline typename thrust::detail::pointer_traits::raw_pointer + raw_pointer_cast(const Pointer &ptr); + + +/*! \p raw_reference_cast creates a "raw" reference from a wrapped reference type, + * simply returning the underlying reference, should it exist. + * + * If the argument is not a reference wrapper, the result is a reference to the argument. + * + * \param ref The reference of interest. + * \return *thrust::raw_pointer_cast(&ref). + * \note There are two versions of \p raw_reference_cast. One for const references, + * and one for non-const. + * \see raw_pointer_cast + */ +template +__host__ __device__ +inline typename detail::raw_reference::type + raw_reference_cast(T &ref); + + +/*! \p raw_reference_cast creates a "raw" reference from a wrapped reference type, + * simply returning the underlying reference, should it exist. + * + * If the argument is not a reference wrapper, the result is a reference to the argument. + * + * \param ref The reference of interest. + * \return *thrust::raw_pointer_cast(&ref). + * \note There are two versions of \p raw_reference_cast. One for const references, + * and one for non-const. + * \see raw_pointer_cast + */ +template +__host__ __device__ +inline typename detail::raw_reference::type + raw_reference_cast(const T &ref); + + +/*! \} + */ + +} // end thrust + diff --git a/compat/thrust/merge.h b/compat/thrust/merge.h new file mode 100644 index 0000000..e5fa7b4 --- /dev/null +++ b/compat/thrust/merge.h @@ -0,0 +1,676 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file merge.h + * \brief Merging sorted ranges + */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ + + +/*! \addtogroup merging Merging + * \ingroup algorithms + * \{ + */ + + +/*! \p merge combines two sorted ranges [first1, last1) and [first2, last2) + * into a single sorted range. That is, it copies from [first1, last1) and + * [first2, last2) into [result, result + (last1 - first1) + (last2 - first2)) + * such that the resulting range is in ascending order. \p merge is stable, meaning both that the + * relative order of elements within each input range is preserved, and that for equivalent elements + * in both input ranges the element from the first range precedes the element from the second. The + * return value is result + (last1 - first1) + (last2 - first2). + * + * This version of \p merge compares elements using \c operator<. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first1 The beginning of the first input range. + * \param last1 The end of the first input range. + * \param first2 The beginning of the second input range. + * \param last2 The end of the second input range. + * \param result The beginning of the merged output. + * \return The end of the output range. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator1 is a model of Input Iterator, + * \p InputIterator1 and \p InputIterator2 have the same \c value_type, + * \p InputIterator1's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam InputIterator2 is a model of Input Iterator, + * \p InputIterator2 and \p InputIterator1 have the same \c value_type, + * \p InputIterator2's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam OutputIterator is a model of Output Iterator. + * + * \pre The ranges [first1, last1) and [first2, last2) shall be sorted with respect to operator<. + * \pre The resulting range shall not overlap with either input range. + * + * The following code snippet demonstrates how to use + * \p merge to compute the merger of two sorted sets of integers using the \p thrust::host execution policy for parallelization: + * + * \code + * #include + * #include + * ... 
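+ * // note: both input ranges below are already sorted ascending,
+ * // as required when merge compares elements with operator<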
+ * int A1[6] = {1, 3, 5, 7, 9, 11}; + * int A2[7] = {1, 1, 2, 3, 5, 8, 13}; + * + * int result[13]; + * + * int *result_end = + * thrust::merge(thrust::host, + * A1, A1 + 6, + * A2, A2 + 7, + * result); + * // result = {1, 1, 1, 2, 3, 3, 5, 5, 7, 8, 9, 11, 13} + * \endcode + * + * \see http://www.sgi.com/tech/stl/merge.html + * \see \p set_union + * \see \p sort + * \see \p is_sorted + */ +template + OutputIterator merge(const thrust::detail::execution_policy_base &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result); + + +/*! \p merge combines two sorted ranges [first1, last1) and [first2, last2) + * into a single sorted range. That is, it copies from [first1, last1) and + * [first2, last2) into [result, result + (last1 - first1) + (last2 - first2)) + * such that the resulting range is in ascending order. \p merge is stable, meaning both that the + * relative order of elements within each input range is preserved, and that for equivalent elements + * in both input ranges the element from the first range precedes the element from the second. The + * return value is result + (last1 - first1) + (last2 - first2). + * + * This version of \p merge compares elements using \c operator<. + * + * \param first1 The beginning of the first input range. + * \param last1 The end of the first input range. + * \param first2 The beginning of the second input range. + * \param last2 The end of the second input range. + * \param result The beginning of the merged output. + * \return The end of the output range. + * + * \tparam InputIterator1 is a model of Input Iterator, + * \p InputIterator1 and \p InputIterator2 have the same \c value_type, + * \p InputIterator1's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam InputIterator2 is a model of Input Iterator, + * \p InputIterator2 and \p InputIterator1 have the same \c value_type, + * \p InputIterator2's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam OutputIterator is a model of Output Iterator. + * + * \pre The ranges [first1, last1) and [first2, last2) shall be sorted with respect to operator<. + * \pre The resulting range shall not overlap with either input range. + * + * The following code snippet demonstrates how to use + * \p merge to compute the merger of two sorted sets of integers. + * + * \code + * #include + * ... + * int A1[6] = {1, 3, 5, 7, 9, 11}; + * int A2[7] = {1, 1, 2, 3, 5, 8, 13}; + * + * int result[13]; + * + * int *result_end = thrust::merge(A1, A1 + 6, A2, A2 + 7, result); + * // result = {1, 1, 1, 2, 3, 3, 5, 5, 7, 8, 9, 11, 13} + * \endcode + * + * \see http://www.sgi.com/tech/stl/merge.html + * \see \p set_union + * \see \p sort + * \see \p is_sorted + */ +template + OutputIterator merge(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result); + + +/*! \p merge combines two sorted ranges [first1, last1) and [first2, last2) + * into a single sorted range. 
That is, it copies from [first1, last1) and + * [first2, last2) into [result, result + (last1 - first1) + (last2 - first2)) + * such that the resulting range is in ascending order. \p merge is stable, meaning both that the + * relative order of elements within each input range is preserved, and that for equivalent elements + * in both input ranges the element from the first range precedes the element from the second. The + * return value is result + (last1 - first1) + (last2 - first2). + * + * This version of \p merge compares elements using a function object \p comp. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first1 The beginning of the first input range. + * \param last1 The end of the first input range. + * \param first2 The beginning of the second input range. + * \param last2 The end of the second input range. + * \param result The beginning of the merged output. + * \param comp Comparison operator. + * \return The end of the output range. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator1 is a model of Input Iterator, + * \p InputIterator1's \c value_type is convertable to \p StrictWeakCompare's \c first_argument_type. + * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam InputIterator2 is a model of Input Iterator, + * \p InputIterator2's \c value_type is convertable to \p StrictWeakCompare's \c second_argument_type. + * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam OutputIterator is a model of Output Iterator. + * \tparam StrictWeakCompare is a model of Strict Weak Ordering. + * + * \pre The ranges [first1, last1) and [first2, last2) shall be sorted with respect to \p comp. + * \pre The resulting range shall not overlap with either input range. + * + * The following code snippet demonstrates how to use + * \p merge to compute the merger of two sets of integers sorted in + * descending order using the \p thrust::host execution policy for parallelization: + * + * \code + * #include + * #include + * #include + * ... + * int A1[6] = {11, 9, 7, 5, 3, 1}; + * int A2[7] = {13, 8, 5, 3, 2, 1, 1}; + * + * int result[13]; + * + * int *result_end = thrust::merge(thrust::host, + * A1, A1 + 6, + * A2, A2 + 7, + * result, + * thrust::greater()); + * // result = {13, 11, 9, 8, 7, 5, 5, 3, 3, 2, 1, 1, 1} + * \endcode + * + * \see http://www.sgi.com/tech/stl/merge.html + * \see \p sort + * \see \p is_sorted + */ +template + OutputIterator merge(const thrust::detail::execution_policy_base &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + StrictWeakCompare comp); + + +/*! \p merge combines two sorted ranges [first1, last1) and [first2, last2) + * into a single sorted range. That is, it copies from [first1, last1) and + * [first2, last2) into [result, result + (last1 - first1) + (last2 - first2)) + * such that the resulting range is in ascending order. \p merge is stable, meaning both that the + * relative order of elements within each input range is preserved, and that for equivalent elements + * in both input ranges the element from the first range precedes the element from the second. The + * return value is result + (last1 - first1) + (last2 - first2). 
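+ *
+ * A minimal sketch, assuming plain integer arrays and the \c thrust::less<int>()
+ * comparator (all names below are illustrative only), of sizing the output
+ * range and using the returned iterator:
+ *
+ * \code
+ * #include <thrust/merge.h>
+ * #include <thrust/functional.h>
+ * ...
+ * int a[3] = {1, 3, 5};
+ * int b[4] = {2, 3, 4, 6};
+ *
+ * // the output must have room for (last1 - first1) + (last2 - first2) elements
+ * int result[3 + 4];
+ *
+ * int *result_end = thrust::merge(a, a + 3, b, b + 4, result, thrust::less<int>());
+ * // result     = {1, 2, 3, 3, 4, 5, 6}
+ * // result_end = result + 7
+ * \endcode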
+ * + * This version of \p merge compares elements using a function object \p comp. + * + * \param first1 The beginning of the first input range. + * \param last1 The end of the first input range. + * \param first2 The beginning of the second input range. + * \param last2 The end of the second input range. + * \param result The beginning of the merged output. + * \param comp Comparison operator. + * \return The end of the output range. + * + * \tparam InputIterator1 is a model of Input Iterator, + * \p InputIterator1's \c value_type is convertable to \p StrictWeakCompare's \c first_argument_type. + * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam InputIterator2 is a model of Input Iterator, + * \p InputIterator2's \c value_type is convertable to \p StrictWeakCompare's \c second_argument_type. + * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam OutputIterator is a model of Output Iterator. + * \tparam StrictWeakCompare is a model of Strict Weak Ordering. + * + * \pre The ranges [first1, last1) and [first2, last2) shall be sorted with respect to \p comp. + * \pre The resulting range shall not overlap with either input range. + * + * The following code snippet demonstrates how to use + * \p merge to compute the merger of two sets of integers sorted in + * descending order. + * + * \code + * #include + * #include + * ... + * int A1[6] = {11, 9, 7, 5, 3, 1}; + * int A2[7] = {13, 8, 5, 3, 2, 1, 1}; + * + * int result[13]; + * + * int *result_end = thrust::merge(A1, A1 + 6, A2, A2 + 7, result, thrust::greater()); + * // result = {13, 11, 9, 8, 7, 5, 5, 3, 3, 2, 1, 1, 1} + * \endcode + * + * \see http://www.sgi.com/tech/stl/merge.html + * \see \p sort + * \see \p is_sorted + */ +template + OutputIterator merge(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + StrictWeakCompare comp); + + +/*! \p merge_by_key performs a key-value merge. That is, \p merge_by_key copies elements from + * [keys_first1, keys_last1) and [keys_first2, keys_last2) into a single range, + * [keys_result, keys_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2)) such that + * the resulting range is in ascending key order. + * + * At the same time, \p merge_by_key copies elements from the two associated ranges [values_first1 + (keys_last1 - keys_first1)) + * and [values_first2 + (keys_last2 - keys_first2)) into a single range, + * [values_result, values_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2)) such that + * the resulting range is in ascending order implied by each input element's associated key. + * + * \p merge_by_key is stable, meaning both that the relative order of elements within each input range is + * preserved, and that for equivalent elements in all input key ranges the element from the first range + * precedes the element from the second. + * + * The return value is is (keys_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2)) + * and (values_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2)). + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param keys_first1 The beginning of the first input range of keys. + * \param keys_last1 The end of the first input range of keys. 
+ * \param keys_first2 The beginning of the second input range of keys. + * \param keys_last2 The end of the second input range of keys. + * \param values_first1 The beginning of the first input range of values. + * \param values_first2 The beginning of the first input range of values. + * \param keys_result The beginning of the merged output range of keys. + * \param values_result The beginning of the merged output range of values. + * \return A \p pair \c p such that p.first is the end of the output range of keys, + * and such that p.second is the end of the output range of values. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator1 is a model of Input Iterator, + * \p InputIterator1 and \p InputIterator2 have the same \c value_type, + * \p InputIterator1's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam InputIterator2 is a model of Input Iterator, + * \p InputIterator2 and \p InputIterator1 have the same \c value_type, + * \p InputIterator2's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam InputIterator3 is a model of Input Iterator, + * and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. + * \tparam InputIterator4 is a model of Input Iterator, + * and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. + * \tparam OutputIterator1 is a model of Output Iterator. + * \tparam OutputIterator2 is a model of Output Iterator. + * + * \pre The ranges [keys_first1, keys_last1) and [keys_first2, keys_last2) shall be sorted with respect to operator<. + * \pre The resulting ranges shall not overlap with any input range. + * + * The following code snippet demonstrates how to use + * \p merge_by_key to compute the merger of two sets of integers sorted in + * ascending order using the \p thrust::host execution policy for parallelization: + * + * \code + * #include + * #include + * #include + * ... + * int A_keys[6] = {1, 3, 5, 7, 9, 11}; + * int A_vals[6] = {0, 0, 0, 0, 0, 0}; + * + * int B_keys[7] = {1, 1, 2, 3, 5, 8, 13}; + * int B_vals[7] = {1, 1, 1, 1, 1, 1, 1}; + * + * int keys_result[13]; + * int vals_result[13]; + * + * thrust::pair end = + * thrust::merge_by_key(thrust::host, + * A_keys, A_keys + 6, + * B_keys, B_keys + 7, + * A_vals, B_vals, + * keys_result, vals_result); + * + * // keys_result = {1, 1, 1, 2, 3, 3, 5, 5, 7, 8, 9, 11, 13} + * // vals_result = {0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1} + * \endcode + * + * \see merge + * \see \p sort_by_key + * \see \p is_sorted + */ +template + thrust::pair + merge_by_key(const thrust::detail::execution_policy_base &exec, + InputIterator1 keys_first1, InputIterator1 keys_last1, + InputIterator2 keys_first2, InputIterator2 keys_last2, + InputIterator3 values_first1, InputIterator4 values_first2, + OutputIterator1 keys_result, + OutputIterator2 values_result); + + +/*! \p merge_by_key performs a key-value merge. 
That is, \p merge_by_key copies elements from + * [keys_first1, keys_last1) and [keys_first2, keys_last2) into a single range, + * [keys_result, keys_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2)) such that + * the resulting range is in ascending key order. + * + * At the same time, \p merge_by_key copies elements from the two associated ranges [values_first1 + (keys_last1 - keys_first1)) + * and [values_first2 + (keys_last2 - keys_first2)) into a single range, + * [values_result, values_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2)) such that + * the resulting range is in ascending order implied by each input element's associated key. + * + * \p merge_by_key is stable, meaning both that the relative order of elements within each input range is + * preserved, and that for equivalent elements in all input key ranges the element from the first range + * precedes the element from the second. + * + * The return value is is (keys_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2)) + * and (values_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2)). + * + * \param keys_first1 The beginning of the first input range of keys. + * \param keys_last1 The end of the first input range of keys. + * \param keys_first2 The beginning of the second input range of keys. + * \param keys_last2 The end of the second input range of keys. + * \param values_first1 The beginning of the first input range of values. + * \param values_first2 The beginning of the first input range of values. + * \param keys_result The beginning of the merged output range of keys. + * \param values_result The beginning of the merged output range of values. + * \return A \p pair \c p such that p.first is the end of the output range of keys, + * and such that p.second is the end of the output range of values. + * + * \tparam InputIterator1 is a model of Input Iterator, + * \p InputIterator1 and \p InputIterator2 have the same \c value_type, + * \p InputIterator1's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam InputIterator2 is a model of Input Iterator, + * \p InputIterator2 and \p InputIterator1 have the same \c value_type, + * \p InputIterator2's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam InputIterator3 is a model of Input Iterator, + * and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. + * \tparam InputIterator4 is a model of Input Iterator, + * and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. + * \tparam OutputIterator1 is a model of Output Iterator. + * \tparam OutputIterator2 is a model of Output Iterator. + * + * \pre The ranges [keys_first1, keys_last1) and [keys_first2, keys_last2) shall be sorted with respect to operator<. + * \pre The resulting ranges shall not overlap with any input range. 
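+ *
+ * Note that no \c values_last1 or \c values_last2 iterators are passed: the
+ * length of each values range is implied by its key range. A minimal sketch,
+ * with illustrative array names, of the sizes involved:
+ *
+ * \code
+ * #include <thrust/merge.h>
+ * ...
+ * int ka[2] = {1, 4};        int va[2] = {10, 40};
+ * int kb[3] = {2, 3, 5};     int vb[3] = {20, 30, 50};
+ *
+ * // each output must hold (keys_last1 - keys_first1) + (keys_last2 - keys_first2) elements
+ * int k_out[2 + 3];
+ * int v_out[2 + 3];
+ *
+ * // the value ranges have no explicit end iterators; their lengths are
+ * // implied by the key ranges (2 and 3 elements here)
+ * thrust::merge_by_key(ka, ka + 2, kb, kb + 3, va, vb, k_out, v_out);
+ * // k_out = {1, 2, 3, 4, 5}
+ * // v_out = {10, 20, 30, 40, 50}
+ * \endcode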
+ * + * The following code snippet demonstrates how to use + * \p merge_by_key to compute the merger of two sets of integers sorted in + * ascending order. + * + * \code + * #include + * #include + * ... + * int A_keys[6] = {1, 3, 5, 7, 9, 11}; + * int A_vals[6] = {0, 0, 0, 0, 0, 0}; + * + * int B_keys[7] = {1, 1, 2, 3, 5, 8, 13}; + * int B_vals[7] = {1, 1, 1, 1, 1, 1, 1}; + * + * int keys_result[13]; + * int vals_result[13]; + * + * thrust::pair end = thrust::merge_by_key(A_keys, A_keys + 6, B_keys, B_keys + 7, A_vals, B_vals, keys_result, vals_result); + * + * // keys_result = {1, 1, 1, 2, 3, 3, 5, 5, 7, 8, 9, 11, 13} + * // vals_result = {0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1} + * \endcode + * + * \see merge + * \see \p sort_by_key + * \see \p is_sorted + */ +template + thrust::pair + merge_by_key(InputIterator1 keys_first1, InputIterator1 keys_last1, + InputIterator2 keys_first2, InputIterator2 keys_last2, + InputIterator3 values_first1, InputIterator4 values_first2, + OutputIterator1 keys_result, + OutputIterator2 values_result); + + +/*! \p merge_by_key performs a key-value merge. That is, \p merge_by_key copies elements from + * [keys_first1, keys_last1) and [keys_first2, keys_last2) into a single range, + * [keys_result, keys_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2)) such that + * the resulting range is in ascending key order. + * + * At the same time, \p merge_by_key copies elements from the two associated ranges [values_first1 + (keys_last1 - keys_first1)) + * and [values_first2 + (keys_last2 - keys_first2)) into a single range, + * [values_result, values_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2)) such that + * the resulting range is in ascending order implied by each input element's associated key. + * + * \p merge_by_key is stable, meaning both that the relative order of elements within each input range is + * preserved, and that for equivalent elements in all input key ranges the element from the first range + * precedes the element from the second. + * + * The return value is is (keys_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2)) + * and (values_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2)). + * + * This version of \p merge_by_key compares key elements using a function object \p comp. + * + * The algorithm's execution is parallelized using \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param keys_first1 The beginning of the first input range of keys. + * \param keys_last1 The end of the first input range of keys. + * \param keys_first2 The beginning of the second input range of keys. + * \param keys_last2 The end of the second input range of keys. + * \param values_first1 The beginning of the first input range of values. + * \param values_first2 The beginning of the first input range of values. + * \param keys_result The beginning of the merged output range of keys. + * \param values_result The beginning of the merged output range of values. + * \param comp Comparison operator. + * \return A \p pair \c p such that p.first is the end of the output range of keys, + * and such that p.second is the end of the output range of values. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator1 is a model of Input Iterator, + * \p InputIterator1's \c value_type is convertable to \p StrictWeakCompare's \c first_argument_type. 
+ * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator1's set of \c value_types. + * \tparam InputIterator2 is a model of Input Iterator, + * \p InputIterator2's \c value_type is convertable to \p StrictWeakCompare's \c second_argument_type. + * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator1's set of \c value_types. + * \tparam InputIterator3 is a model of Input Iterator, + * and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. + * \tparam InputIterator4 is a model of Input Iterator, + * and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. + * \tparam OutputIterator1 is a model of Output Iterator. + * \tparam OutputIterator2 is a model of Output Iterator. + * \tparam StrictWeakCompare is a model of Strict Weak Ordering. + * + * \pre The ranges [keys_first1, keys_last1) and [keys_first2, keys_last2) shall be sorted with respect to \p comp. + * \pre The resulting ranges shall not overlap with any input range. + * + * The following code snippet demonstrates how to use + * \p merge_by_key to compute the merger of two sets of integers sorted in + * descending order using the \p thrust::host execution policy for parallelization: + * + * \code + * #include + * #include + * #include + * ... + * int A_keys[6] = {11, 9, 7, 5, 3, 1}; + * int A_vals[6] = { 0, 0, 0, 0, 0, 0}; + * + * int B_keys[7] = {13, 8, 5, 3, 2, 1, 1}; + * int B_vals[7] = { 1, 1, 1, 1, 1, 1, 1}; + * + * int keys_result[13]; + * int vals_result[13]; + * + * thrust::pair end = + * thrust::merge_by_key(thrust::host, + * A_keys, A_keys + 6, + * B_keys, B_keys + 7, + * A_vals, B_vals, + * keys_result, vals_result, + * thrust::greater()); + * + * // keys_result = {13, 11, 9, 8, 7, 5, 5, 3, 3, 2, 1, 1, 1} + * // vals_result = { 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1} + * \endcode + * + * \see merge + * \see \p sort_by_key + * \see \p is_sorted + */ +template + thrust::pair + merge_by_key(const thrust::detail::execution_policy_base &exec, + InputIterator1 keys_first1, InputIterator1 keys_last1, + InputIterator2 keys_first2, InputIterator2 keys_last2, + InputIterator3 values_first1, InputIterator4 values_first2, + OutputIterator1 keys_result, + OutputIterator2 values_result, + Compare comp); + + +/*! \p merge_by_key performs a key-value merge. That is, \p merge_by_key copies elements from + * [keys_first1, keys_last1) and [keys_first2, keys_last2) into a single range, + * [keys_result, keys_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2)) such that + * the resulting range is in ascending key order. + * + * At the same time, \p merge_by_key copies elements from the two associated ranges [values_first1 + (keys_last1 - keys_first1)) + * and [values_first2 + (keys_last2 - keys_first2)) into a single range, + * [values_result, values_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2)) such that + * the resulting range is in ascending order implied by each input element's associated key. + * + * \p merge_by_key is stable, meaning both that the relative order of elements within each input range is + * preserved, and that for equivalent elements in all input key ranges the element from the first range + * precedes the element from the second. + * + * The return value is is (keys_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2)) + * and (values_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2)). 
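+ *
+ * The two members of the returned \p pair point one past the last key and the
+ * last value written, respectively. A minimal sketch, with illustrative names,
+ * assuming descending inputs and the \c thrust::greater<int>() comparator:
+ *
+ * \code
+ * #include <thrust/merge.h>
+ * #include <thrust/functional.h>
+ * ...
+ * int ka[2] = {4, 1};     int va[2] = {40, 10};
+ * int kb[3] = {5, 3, 2};  int vb[3] = {50, 30, 20};
+ *
+ * int k_out[5];
+ * int v_out[5];
+ *
+ * thrust::pair<int*, int*> ends =
+ *   thrust::merge_by_key(ka, ka + 2, kb, kb + 3, va, vb,
+ *                        k_out, v_out, thrust::greater<int>());
+ *
+ * // ends.first  == k_out + 5, one past the last key written
+ * // ends.second == v_out + 5, one past the last value written
+ * // k_out = {5, 4, 3, 2, 1}
+ * // v_out = {50, 40, 30, 20, 10}
+ * \endcode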
+ * + * This version of \p merge_by_key compares key elements using a function object \p comp. + * + * \param keys_first1 The beginning of the first input range of keys. + * \param keys_last1 The end of the first input range of keys. + * \param keys_first2 The beginning of the second input range of keys. + * \param keys_last2 The end of the second input range of keys. + * \param values_first1 The beginning of the first input range of values. + * \param values_first2 The beginning of the first input range of values. + * \param keys_result The beginning of the merged output range of keys. + * \param values_result The beginning of the merged output range of values. + * \param comp Comparison operator. + * \return A \p pair \c p such that p.first is the end of the output range of keys, + * and such that p.second is the end of the output range of values. + * + * \tparam InputIterator1 is a model of Input Iterator, + * \p InputIterator1's \c value_type is convertable to \p StrictWeakCompare's \c first_argument_type. + * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator1's set of \c value_types. + * \tparam InputIterator2 is a model of Input Iterator, + * \p InputIterator2's \c value_type is convertable to \p StrictWeakCompare's \c second_argument_type. + * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator1's set of \c value_types. + * \tparam InputIterator3 is a model of Input Iterator, + * and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. + * \tparam InputIterator4 is a model of Input Iterator, + * and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. + * \tparam OutputIterator1 is a model of Output Iterator. + * \tparam OutputIterator2 is a model of Output Iterator. + * \tparam StrictWeakCompare is a model of Strict Weak Ordering. + * + * \pre The ranges [keys_first1, keys_last1) and [keys_first2, keys_last2) shall be sorted with respect to \p comp. + * \pre The resulting ranges shall not overlap with any input range. + * + * The following code snippet demonstrates how to use + * \p merge_by_key to compute the merger of two sets of integers sorted in + * descending order. + * + * \code + * #include + * #include + * ... + * int A_keys[6] = {11, 9, 7, 5, 3, 1}; + * int A_vals[6] = { 0, 0, 0, 0, 0, 0}; + * + * int B_keys[7] = {13, 8, 5, 3, 2, 1, 1}; + * int B_vals[7] = { 1, 1, 1, 1, 1, 1, 1}; + * + * int keys_result[13]; + * int vals_result[13]; + * + * thrust::pair end = thrust::merge_by_key(A_keys, A_keys + 6, B_keys, B_keys + 7, A_vals, B_vals, keys_result, vals_result, thrust::greater()); + * + * // keys_result = {13, 11, 9, 8, 7, 5, 5, 3, 3, 2, 1, 1, 1} + * // vals_result = { 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1} + * \endcode + * + * \see merge + * \see \p sort_by_key + * \see \p is_sorted + */ +template + thrust::pair + merge_by_key(InputIterator1 keys_first1, InputIterator1 keys_last1, + InputIterator2 keys_first2, InputIterator2 keys_last2, + InputIterator3 values_first1, InputIterator4 values_first2, + OutputIterator1 keys_result, + OutputIterator2 values_result, + StrictWeakCompare comp); + + +/*! 
\} // merging + */ + +} // end thrust + +#include + diff --git a/compat/thrust/mismatch.h b/compat/thrust/mismatch.h new file mode 100644 index 0000000..898157a --- /dev/null +++ b/compat/thrust/mismatch.h @@ -0,0 +1,258 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file mismatch.h + * \brief Search for differences between ranges + */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ + + +/*! \addtogroup algorithms + */ + +/*! \addtogroup searching + * \ingroup algorithms + * \{ + */ + + +/*! \p mismatch finds the first position where the two ranges [first1, last1) + * and [first2, first2 + (last1 - first1)) differ. The two versions of + * \p mismatch use different tests for whether elements differ. + * + * This version of \p mismatch finds the first iterator \c i in [first1, last1) + * such that *i == *(first2 + (i - first1)) is \c false. The return value is a + * \c pair whose first element is \c i and whose second element is *(first2 + (i - first1)). + * If no such iterator \c i exists, the return value is a \c pair whose first element + * is \c last1 and whose second element is *(first2 + (last1 - first1)). + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first1 The beginning of the first sequence. + * \param last1 The end of the first sequence. + * \param first2 The beginning of the second sequence. + * \return The first position where the sequences differ. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator1 is a model of Input Iterator + * and \p InputIterator1's \c value_type is equality comparable to \p InputIterator2's \c value_type. + * \tparam InputIterator2 is a model of Input Iterator. + * + * \code + * #include + * #include + * #include + * ... + * thrust::device_vector vec1(4); + * thrust::device_vector vec2(4); + * + * vec1[0] = 0; vec2[0] = 0; + * vec1[1] = 5; vec2[1] = 5; + * vec1[2] = 3; vec2[2] = 8; + * vec1[3] = 7; vec2[3] = 7; + * + * typedef thrust::device_vector::iterator Iterator; + * thrust::pair result; + * + * result = thrust::mismatch(thrust::device, vec1.begin(), vec1.end(), vec2.begin()); + * + * // result.first is vec1.begin() + 2 + * // result.second is vec2.begin() + 2 + * \endcode + * + * \see find + * \see find_if + */ +template +thrust::pair mismatch(const thrust::detail::execution_policy_base &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2); + + +/*! \p mismatch finds the first position where the two ranges [first1, last1) + * and [first2, first2 + (last1 - first1)) differ. The two versions of + * \p mismatch use different tests for whether elements differ. + * + * This version of \p mismatch finds the first iterator \c i in [first1, last1) + * such that *i == *(first2 + (i - first1)) is \c false. 
The return value is a + * \c pair whose first element is \c i and whose second element is *(first2 + (i - first1)). + * If no such iterator \c i exists, the return value is a \c pair whose first element + * is \c last1 and whose second element is *(first2 + (last1 - first1)). + * + * \param first1 The beginning of the first sequence. + * \param last1 The end of the first sequence. + * \param first2 The beginning of the second sequence. + * \return The first position where the sequences differ. + * + * \tparam InputIterator1 is a model of Input Iterator + * and \p InputIterator1's \c value_type is equality comparable to \p InputIterator2's \c value_type. + * \tparam InputIterator2 is a model of Input Iterator. + * + * \code + * #include + * #include + * ... + * thrust::device_vector vec1(4); + * thrust::device_vector vec2(4); + * + * vec1[0] = 0; vec2[0] = 0; + * vec1[1] = 5; vec2[1] = 5; + * vec1[2] = 3; vec2[2] = 8; + * vec1[3] = 7; vec2[3] = 7; + * + * typedef thrust::device_vector::iterator Iterator; + * thrust::pair result; + * + * result = thrust::mismatch(vec1.begin(), vec1.end(), vec2.begin()); + * + * // result.first is vec1.begin() + 2 + * // result.second is vec2.begin() + 2 + * \endcode + * + * \see find + * \see find_if + */ +template +thrust::pair mismatch(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2); + + +/*! \p mismatch finds the first position where the two ranges [first1, last1) + * and [first2, first2 + (last1 - first1)) differ. The two versions of + * \p mismatch use different tests for whether elements differ. + * + * This version of \p mismatch finds the first iterator \c i in [first1, last1) + * such that pred(\*i, \*(first2 + (i - first1)) is \c false. The return value is a + * \c pair whose first element is \c i and whose second element is *(first2 + (i - first1)). + * If no such iterator \c i exists, the return value is a \c pair whose first element is + * \c last1 and whose second element is *(first2 + (last1 - first1)). + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first1 The beginning of the first sequence. + * \param last1 The end of the first sequence. + * \param first2 The beginning of the second sequence. + * \param pred The binary predicate to compare elements. + * \return The first position where the sequences differ. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator1 is a model of Input Iterator. + * \tparam InputIterator2 is a model of Input Iterator. + * \tparam Predicate is a model of Input Iterator. + * + * \code + * #include + * #include + * #include + * ... + * thrust::device_vector vec1(4); + * thrust::device_vector vec2(4); + * + * vec1[0] = 0; vec2[0] = 0; + * vec1[1] = 5; vec2[1] = 5; + * vec1[2] = 3; vec2[2] = 8; + * vec1[3] = 7; vec2[3] = 7; + * + * typedef thrust::device_vector::iterator Iterator; + * thrust::pair result; + * + * result = thrust::mismatch(thrust::device, vec1.begin(), vec1.end(), vec2.begin(), thrust::equal_to()); + * + * // result.first is vec1.begin() + 2 + * // result.second is vec2.begin() + 2 + * \endcode + * + * \see find + * \see find_if + */ +template +thrust::pair mismatch(const thrust::detail::execution_policy_base &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + BinaryPredicate pred); + + +/*! 
\p mismatch finds the first position where the two ranges [first1, last1) + * and [first2, first2 + (last1 - first1)) differ. The two versions of + * \p mismatch use different tests for whether elements differ. + * + * This version of \p mismatch finds the first iterator \c i in [first1, last1) + * such that pred(\*i, \*(first2 + (i - first1)) is \c false. The return value is a + * \c pair whose first element is \c i and whose second element is *(first2 + (i - first1)). + * If no such iterator \c i exists, the return value is a \c pair whose first element is + * \c last1 and whose second element is *(first2 + (last1 - first1)). + * + * \param first1 The beginning of the first sequence. + * \param last1 The end of the first sequence. + * \param first2 The beginning of the second sequence. + * \param pred The binary predicate to compare elements. + * \return The first position where the sequences differ. + * + * \tparam InputIterator1 is a model of Input Iterator. + * \tparam InputIterator2 is a model of Input Iterator. + * \tparam Predicate is a model of Input Iterator. + * + * \code + * #include + * #include + * ... + * thrust::device_vector vec1(4); + * thrust::device_vector vec2(4); + * + * vec1[0] = 0; vec2[0] = 0; + * vec1[1] = 5; vec2[1] = 5; + * vec1[2] = 3; vec2[2] = 8; + * vec1[3] = 7; vec2[3] = 7; + * + * typedef thrust::device_vector::iterator Iterator; + * thrust::pair result; + * + * result = thrust::mismatch(vec1.begin(), vec1.end(), vec2.begin(), thrust::equal_to()); + * + * // result.first is vec1.begin() + 2 + * // result.second is vec2.begin() + 2 + * \endcode + * + * \see find + * \see find_if + */ +template +thrust::pair mismatch(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + BinaryPredicate pred); + +/*! \} // end searching + */ + +} // end namespace thrust + +#include + diff --git a/compat/thrust/pair.h b/compat/thrust/pair.h new file mode 100644 index 0000000..897cc07 --- /dev/null +++ b/compat/thrust/pair.h @@ -0,0 +1,283 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file pair.h + * \brief A type encapsulating a heterogeneous pair of elements + */ + +#pragma once + +#include +#include + +namespace thrust +{ + +/*! \addtogroup utility + * \{ + */ + +/*! \addtogroup pair + * \{ + */ + +/*! \p pair is a generic data structure encapsulating a heterogeneous + * pair of values. + * + * \tparam T1 The type of \p pair's first object type. There are no + * requirements on the type of \p T1. T1's type is + * provided by pair::first_type. + * + * \tparam T2 The type of \p pair's second object type. There are no + * requirements on the type of \p T2. T2's type is + * provided by pair::second_type. + */ +template + struct pair +{ + /*! \p first_type is the type of \p pair's first object type. + */ + typedef T1 first_type; + + /*! \p second_type is the type of \p pair's second object type. + */ + typedef T2 second_type; + + /*! The \p pair's first object. 
+ */ + first_type first; + + /*! The \p pair's second object. + */ + second_type second; + + /*! \p pair's default constructor constructs \p first + * and \p second using \c first_type & \c second_type's + * default constructors, respectively. + */ + __host__ __device__ pair(void); + + /*! This constructor accepts two objects to copy into this \p pair. + * + * \param x The object to copy into \p first. + * \param y The object to copy into \p second. + */ + inline __host__ __device__ + pair(const T1 &x, const T2 &y); + + /*! This copy constructor copies from a \p pair whose types are + * convertible to this \p pair's \c first_type and \c second_type, + * respectively. + * + * \param p The \p pair to copy from. + * + * \tparam U1 is convertible to \c first_type. + * \tparam U2 is convertible to \c second_type. + */ + template + inline __host__ __device__ + pair(const pair &p); + + /*! This copy constructor copies from a std::pair whose types are + * convertible to this \p pair's \c first_type and \c second_type, + * respectively. + * + * \param p The std::pair to copy from. + * + * \tparam U1 is convertible to \c first_type. + * \tparam U2 is convertible to \c second_type. + */ + template + inline __host__ __device__ + pair(const std::pair &p); + + /*! \p swap swaps the elements of two pairs. + * + * \param p The other pair with which to swap. + */ + inline __host__ __device__ + void swap(pair &p); +}; // end pair + + +/*! This operator tests two \p pairs for equality. + * + * \param x The first \p pair to compare. + * \param y The second \p pair to compare. + * \return \c true if and only if x.first == y.first && x.second == y.second. + * + * \tparam T1 is a model of Equality Comparable. + * \tparam T2 is a model of Equality Comparable. + */ +template + inline __host__ __device__ + bool operator==(const pair &x, const pair &y); + + +/*! This operator tests two pairs for ascending ordering. + * + * \param x The first \p pair to compare. + * \param y The second \p pair to compare. + * \return \c true if and only if x.first < y.first || (!(y.first < x.first) && x.second < y.second). + * + * \tparam T1 is a model of LessThan Comparable. + * \tparam T2 is a model of LessThan Comparable. + */ +template + inline __host__ __device__ + bool operator<(const pair &x, const pair &y); + + +/*! This operator tests two pairs for inequality. + * + * \param x The first \p pair to compare. + * \param y The second \p pair to compare. + * \return \c true if and only if !(x == y). + * + * \tparam T1 is a model of Equality Comparable. + * \tparam T2 is a model of Equality Comparable. + */ +template + inline __host__ __device__ + bool operator!=(const pair &x, const pair &y); + + +/*! This operator tests two pairs for descending ordering. + * + * \param x The first \p pair to compare. + * \param y The second \p pair to compare. + * \return \c true if and only if y < x. + * + * \tparam T1 is a model of LessThan Comparable. + * \tparam T2 is a model of LessThan Comparable. + */ +template + inline __host__ __device__ + bool operator>(const pair &x, const pair &y); + + +/*! This operator tests two pairs for ascending ordering or equivalence. + * + * \param x The first \p pair to compare. + * \param y The second \p pair to compare. + * \return \c true if and only if !(y < x). + * + * \tparam T1 is a model of LessThan Comparable. + * \tparam T2 is a model of LessThan Comparable. + */ +template + inline __host__ __device__ + bool operator<=(const pair &x, const pair &y); + + +/*! 
This operator tests two pairs for descending ordering or equivalence. + * + * \param x The first \p pair to compare. + * \param y The second \p pair to compare. + * \return \c true if and only if !(x < y). + * + * \tparam T1 is a model of LessThan Comparable. + * \tparam T2 is a model of LessThan Comparable. + */ +template + inline __host__ __device__ + bool operator>=(const pair &x, const pair &y); + + +/*! \p swap swaps the contents of two pairs. + * + * \param x The first \p pair to swap. + * \param y The second \p pair to swap. + */ +template + inline __host__ __device__ + void swap(pair &x, pair &y); + + +/*! This convenience function creates a \p pair from two objects. + * + * \param x The first object to copy from. + * \param y The second object to copy from. + * \return A newly-constructed \p pair copied from \p a and \p b. + * + * \tparam T1 There are no requirements on the type of \p T1. + * \tparam T2 There are no requirements on the type of \p T2. + */ +template + inline __host__ __device__ + pair make_pair(T1 x, T2 y); + + +/*! This convenience metafunction is included for compatibility with + * \p tuple. It returns either the type of a \p pair's + * \c first_type or \c second_type in its nested type, \c type. + * + * \tparam N This parameter selects the member of interest. + * \tparam T A \c pair type of interest. + */ +template struct tuple_element; + + +/*! This convenience metafunction is included for compatibility with + * \p tuple. It returns \c 2, the number of elements of a \p pair, + * in its nested data member, \c value. + * + * \tparam Pair A \c pair type of interest. + */ +template struct tuple_size; + + +/*! This convenience function returns a reference to either the first or + * second member of a \p pair. + * + * \param p The \p pair of interest. + * \return \c p.first or \c p.second, depending on the template + * parameter. + * + * \tparam N This parameter selects the member of interest. + */ +// XXX comment out these prototypes as a WAR to a problem on MSVC 2005 +//template +// inline __host__ __device__ +// typename tuple_element >::type & +// get(pair &p); + + +/*! This convenience function returns a const reference to either the + * first or second member of a \p pair. + * + * \param p The \p pair of interest. + * \return \c p.first or \c p.second, depending on the template + * parameter. + * + * \tparam i This parameter selects the member of interest. + */ +// XXX comment out these prototypes as a WAR to a problem on MSVC 2005 +//template +// inline __host__ __device__ +// const typename tuple_element >::type & +// get(const pair &p); + +/*! \} // pair + */ + +/*! \} // utility + */ + +} // end thrust + +#include + diff --git a/compat/thrust/partition.h b/compat/thrust/partition.h new file mode 100644 index 0000000..61a6278 --- /dev/null +++ b/compat/thrust/partition.h @@ -0,0 +1,1429 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! 
\file partition.h + * \brief Reorganizes a range based on a predicate + */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ + + +/*! \addtogroup reordering + * \ingroup algorithms + * + * \addtogroup partitioning + * \ingroup reordering + * \{ + */ + + +/*! \p partition reorders the elements [first, last) based on the function + * object \p pred, such that all of the elements that satisfy \p pred precede the + * elements that fail to satisfy it. The postcondition is that, for some iterator + * \c middle in the range [first, last), pred(*i) is \c true for every + * iterator \c i in the range [first,middle) and \c false for every iterator + * \c i in the range [middle, last). The return value of \p partition is + * \c middle. + * + * Note that the relative order of elements in the two reordered sequences is not + * necessarily the same as it was in the original sequence. A different algorithm, + * \ref stable_partition, does guarantee to preserve the relative order. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the sequence to reorder. + * \param last The end of the sequence to reorder. + * \param pred A function object which decides to which partition each element of the + * sequence [first, last) belongs. + * \return An iterator referring to the first element of the second partition, that is, + * the sequence of the elements which do not satisfy \p pred. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam ForwardIterator is a model of Forward Iterator, + * and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type, + * and \p ForwardIterator is mutable. + * \tparam Predicate is a model of Predicate. + * + * The following code snippet demonstrates how to use \p partition to reorder a + * sequence so that even numbers precede odd numbers using the \p thrust::host execution policy for parallelization: + * + * \code + * #include + * #include + * ... + * struct is_even + * { + * __host__ __device__ + * bool operator()(const int &x) + * { + * return (x % 2) == 0; + * } + * }; + * ... + * int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + * const int N = sizeof(A)/sizeof(int); + * thrust::partition(thrust::host, + * A, A + N, + * is_even()); + * // A is now {2, 4, 6, 8, 10, 1, 3, 5, 7, 9} + * \endcode + * + * \see http://www.sgi.com/tech/stl/partition.html + * \see \p stable_partition + * \see \p partition_copy + */ +template + ForwardIterator partition(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + Predicate pred); + + +/*! \p partition reorders the elements [first, last) based on the function + * object \p pred, such that all of the elements that satisfy \p pred precede the + * elements that fail to satisfy it. The postcondition is that, for some iterator + * \c middle in the range [first, last), pred(*i) is \c true for every + * iterator \c i in the range [first,middle) and \c false for every iterator + * \c i in the range [middle, last). The return value of \p partition is + * \c middle. + * + * Note that the relative order of elements in the two reordered sequences is not + * necessarily the same as it was in the original sequence. A different algorithm, + * \ref stable_partition, does guarantee to preserve the relative order. + * + * \param first The beginning of the sequence to reorder. 
+ * \param last The end of the sequence to reorder. + * \param pred A function object which decides to which partition each element of the + * sequence [first, last) belongs. + * \return An iterator referring to the first element of the second partition, that is, + * the sequence of the elements which do not satisfy \p pred. + * + * \tparam ForwardIterator is a model of Forward Iterator, + * and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type, + * and \p ForwardIterator is mutable. + * \tparam Predicate is a model of Predicate. + * + * The following code snippet demonstrates how to use \p partition to reorder a + * sequence so that even numbers precede odd numbers. + * + * \code + * #include + * ... + * struct is_even + * { + * __host__ __device__ + * bool operator()(const int &x) + * { + * return (x % 2) == 0; + * } + * }; + * ... + * int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + * const int N = sizeof(A)/sizeof(int); + * thrust::partition(A, A + N, + * is_even()); + * // A is now {2, 4, 6, 8, 10, 1, 3, 5, 7, 9} + * \endcode + * + * \see http://www.sgi.com/tech/stl/partition.html + * \see \p stable_partition + * \see \p partition_copy + */ +template + ForwardIterator partition(ForwardIterator first, + ForwardIterator last, + Predicate pred); + + +/*! \p partition reorders the elements [first, last) based on the function + * object \p pred applied to a stencil range [stencil, stencil + (last - first)), + * such that all of the elements whose corresponding stencil element satisfies \p pred precede all of the elements whose + * corresponding stencil element fails to satisfy it. The postcondition is that, for some iterator + * \c middle in the range [first, last), pred(*stencil_i) is \c true for every iterator + * \c stencil_i in the range [stencil,stencil + (middle - first)) and \c false for every iterator \c stencil_i + * in the range [stencil + (middle - first), stencil + (last - first)). + * The return value of \p stable_partition is \c middle. + * + * Note that the relative order of elements in the two reordered sequences is not + * necessarily the same as it was in the original sequence. A different algorithm, + * \ref stable_partition, does guarantee to preserve the relative order. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the sequence to reorder. + * \param last The end of the sequence to reorder. + * \param stencil The beginning of the stencil sequence. + * \param pred A function object which decides to which partition each element of the + * sequence [first, last) belongs. + * \return An iterator referring to the first element of the second partition, that is, + * the sequence of the elements whose stencil elements do not satisfy \p pred. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam ForwardIterator is a model of Forward Iterator, + * and \p ForwardIterator is mutable. + * \tparam InputIterator is a model of Input Iterator, + * and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type. + * \tparam Predicate is a model of Predicate. + * + * \pre The ranges [first,last) and [stencil, stencil + (last - first)) shall not overlap. 
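+ *
+ * A device-side sketch of the same stencil idea (illustrative only: it assumes
+ * the data already lives in \p thrust::device_vector containers and uses
+ * \p thrust::identity as the predicate):
+ *
+ * \code
+ * #include <thrust/partition.h>
+ * #include <thrust/device_vector.h>
+ * #include <thrust/functional.h>
+ * #include <thrust/execution_policy.h>
+ * ...
+ * thrust::device_vector<int> vals(6);
+ * thrust::device_vector<int> flags(6);
+ * for (int i = 0; i < 6; ++i)
+ * {
+ *   vals[i]  = 10 * (i + 1);   // 10, 20, 30, 40, 50, 60
+ *   flags[i] = i % 2;          // stencil: 0, 1, 0, 1, 0, 1
+ * }
+ * // elements whose stencil entry is non-zero end up in the first partition:
+ * // some ordering of {20, 40, 60} followed by some ordering of {10, 30, 50}
+ * thrust::partition(thrust::device, vals.begin(), vals.end(), flags.begin(),
+ *                   thrust::identity<int>());
+ * \endcode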
+ * + * The following code snippet demonstrates how to use \p partition to reorder a + * sequence so that even numbers precede odd numbers using the \p thrust::host execution policy for parallelization: + * + * \code + * #include + * #include + * ... + * struct is_even + * { + * __host__ __device__ + * bool operator()(const int &x) + * { + * return (x % 2) == 0; + * } + * }; + * ... + * int A[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; + * int S[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + * const int N = sizeof(A)/sizeof(int); + * thrust::partition(thrust::host, A, A + N, S, is_even()); + * // A is now {1, 1, 1, 1, 1, 0, 0, 0, 0, 0} + * // S is unmodified + * \endcode + * + * \see http://www.sgi.com/tech/stl/partition.html + * \see \p stable_partition + * \see \p partition_copy + */ +template + ForwardIterator partition(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + InputIterator stencil, + Predicate pred); + + +/*! \p partition reorders the elements [first, last) based on the function + * object \p pred applied to a stencil range [stencil, stencil + (last - first)), + * such that all of the elements whose corresponding stencil element satisfies \p pred precede all of the elements whose + * corresponding stencil element fails to satisfy it. The postcondition is that, for some iterator + * \c middle in the range [first, last), pred(*stencil_i) is \c true for every iterator + * \c stencil_i in the range [stencil,stencil + (middle - first)) and \c false for every iterator \c stencil_i + * in the range [stencil + (middle - first), stencil + (last - first)). + * The return value of \p stable_partition is \c middle. + * + * Note that the relative order of elements in the two reordered sequences is not + * necessarily the same as it was in the original sequence. A different algorithm, + * \ref stable_partition, does guarantee to preserve the relative order. + * + * \param first The beginning of the sequence to reorder. + * \param last The end of the sequence to reorder. + * \param stencil The beginning of the stencil sequence. + * \param pred A function object which decides to which partition each element of the + * sequence [first, last) belongs. + * \return An iterator referring to the first element of the second partition, that is, + * the sequence of the elements whose stencil elements do not satisfy \p pred. + * + * \tparam ForwardIterator is a model of Forward Iterator, + * and \p ForwardIterator is mutable. + * \tparam InputIterator is a model of Input Iterator, + * and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type. + * \tparam Predicate is a model of Predicate. + * + * \pre The ranges [first,last) and [stencil, stencil + (last - first)) shall not overlap. + * + * The following code snippet demonstrates how to use \p partition to reorder a + * sequence so that even numbers precede odd numbers. + * + * \code + * #include + * ... + * struct is_even + * { + * __host__ __device__ + * bool operator()(const int &x) + * { + * return (x % 2) == 0; + * } + * }; + * ... 
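+ * // A is the data to reorder; S is the stencil that is_even inspects: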
+ * int A[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; + * int S[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + * const int N = sizeof(A)/sizeof(int); + * thrust::partition(A, A + N, S, is_even()); + * // A is now {1, 1, 1, 1, 1, 0, 0, 0, 0, 0} + * // S is unmodified + * \endcode + * + * \see http://www.sgi.com/tech/stl/partition.html + * \see \p stable_partition + * \see \p partition_copy + */ +template + ForwardIterator partition(ForwardIterator first, + ForwardIterator last, + InputIterator stencil, + Predicate pred); + + +/*! \p partition_copy differs from \ref partition only in that the reordered + * sequence is written to difference output sequences, rather than in place. + * + * \p partition_copy copies the elements [first, last) based on the + * function object \p pred. All of the elements that satisfy \p pred are copied + * to the range beginning at \p out_true and all the elements that fail to satisfy it + * are copied to the range beginning at \p out_false. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the sequence to reorder. + * \param last The end of the sequence to reorder. + * \param out_true The destination of the resulting sequence of elements which satisfy \p pred. + * \param out_false The destination of the resulting sequence of elements which fail to satisfy \p pred. + * \param pred A function object which decides to which partition each element of the + * sequence [first, last) belongs. + * \return A \p pair p such that p.first is the end of the output range beginning + * at \p out_true and p.second is the end of the output range beginning at + * \p out_false. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator is a model of Input Iterator, + * and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type and \p InputIterator's \c value_type + * is convertible to \p OutputIterator1 and \p OutputIterator2's \c value_types. + * \tparam OutputIterator1 is a model of Output Iterator. + * \tparam OutputIterator2 is a model of Output Iterator. + * \tparam Predicate is a model of Predicate. + * + * \pre The input range shall not overlap with either output range. + * + * The following code snippet demonstrates how to use \p partition_copy to separate a + * sequence into two output sequences of even and odd numbers using the \p thrust::host execution policy for parallelization: + * + * \code + * #include + * #include + * ... + * struct is_even + * { + * __host__ __device__ + * bool operator()(const int &x) + * { + * return (x % 2) == 0; + * } + * }; + * ... + * int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + * int result[10]; + * const int N = sizeof(A)/sizeof(int); + * int *evens = result; + * int *odds = result + 5; + * thrust::partition_copy(thrust::host, A, A + N, evens, odds, is_even()); + * // A remains {1, 2, 3, 4, 5, 6, 7, 8, 9, 10} + * // result is now {2, 4, 6, 8, 10, 1, 3, 5, 7, 9} + * // evens points to {2, 4, 6, 8, 10} + * // odds points to {1, 3, 5, 7, 9} + * \endcode + * + * \note The relative order of elements in the two reordered sequences is not + * necessarily the same as it was in the original sequence. A different algorithm, + * \ref stable_partition_copy, does guarantee to preserve the relative order. 
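+ *
+ * When the number of elements that satisfy \p pred is not known ahead of time,
+ * one possible approach (shown here only as a sketch, reusing the \c is_even
+ * functor from the snippet above) is to size the two outputs with
+ * \p thrust::count_if before calling \p partition_copy:
+ *
+ * \code
+ * #include <thrust/partition.h>
+ * #include <thrust/count.h>
+ * #include <thrust/device_vector.h>
+ * ...
+ * int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+ * thrust::device_vector<int> d(A, A + 10);
+ * int n_true = thrust::count_if(d.begin(), d.end(), is_even());
+ * thrust::device_vector<int> evens(n_true), odds(10 - n_true);
+ * thrust::partition_copy(d.begin(), d.end(),
+ *                        evens.begin(), odds.begin(), is_even());
+ * // evens holds the even values and odds the odd values
+ * // (the order within each output is not guaranteed by partition_copy)
+ * \endcode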
+ * + * \see http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2569.pdf + * \see \p stable_partition_copy + * \see \p partition + */ +template + thrust::pair + partition_copy(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + OutputIterator1 out_true, + OutputIterator2 out_false, + Predicate pred); + + +/*! \p partition_copy differs from \ref partition only in that the reordered + * sequence is written to difference output sequences, rather than in place. + * + * \p partition_copy copies the elements [first, last) based on the + * function object \p pred. All of the elements that satisfy \p pred are copied + * to the range beginning at \p out_true and all the elements that fail to satisfy it + * are copied to the range beginning at \p out_false. + * + * \param first The beginning of the sequence to reorder. + * \param last The end of the sequence to reorder. + * \param out_true The destination of the resulting sequence of elements which satisfy \p pred. + * \param out_false The destination of the resulting sequence of elements which fail to satisfy \p pred. + * \param pred A function object which decides to which partition each element of the + * sequence [first, last) belongs. + * \return A \p pair p such that p.first is the end of the output range beginning + * at \p out_true and p.second is the end of the output range beginning at + * \p out_false. + * + * \tparam InputIterator is a model of Input Iterator, + * and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type and \p InputIterator's \c value_type + * is convertible to \p OutputIterator1 and \p OutputIterator2's \c value_types. + * \tparam OutputIterator1 is a model of Output Iterator. + * \tparam OutputIterator2 is a model of Output Iterator. + * \tparam Predicate is a model of Predicate. + * + * \pre The input range shall not overlap with either output range. + * + * The following code snippet demonstrates how to use \p partition_copy to separate a + * sequence into two output sequences of even and odd numbers. + * + * \code + * #include + * ... + * struct is_even + * { + * __host__ __device__ + * bool operator()(const int &x) + * { + * return (x % 2) == 0; + * } + * }; + * ... + * int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + * int result[10]; + * const int N = sizeof(A)/sizeof(int); + * int *evens = result; + * int *odds = result + 5; + * thrust::partition_copy(A, A + N, evens, odds, is_even()); + * // A remains {1, 2, 3, 4, 5, 6, 7, 8, 9, 10} + * // result is now {2, 4, 6, 8, 10, 1, 3, 5, 7, 9} + * // evens points to {2, 4, 6, 8, 10} + * // odds points to {1, 3, 5, 7, 9} + * \endcode + * + * \note The relative order of elements in the two reordered sequences is not + * necessarily the same as it was in the original sequence. A different algorithm, + * \ref stable_partition_copy, does guarantee to preserve the relative order. + * + * \see http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2569.pdf + * \see \p stable_partition_copy + * \see \p partition + */ +template + thrust::pair + partition_copy(InputIterator first, + InputIterator last, + OutputIterator1 out_true, + OutputIterator2 out_false, + Predicate pred); + + +/*! \p partition_copy differs from \ref partition only in that the reordered + * sequence is written to difference output sequences, rather than in place. + * + * \p partition_copy copies the elements [first, last) based on the + * function object \p pred which is applied to a range of stencil elements. 
All of the elements + * whose corresponding stencil element satisfies \p pred are copied to the range beginning at \p out_true + * and all the elements whose stencil element fails to satisfy it are copied to the range beginning + * at \p out_false. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the sequence to reorder. + * \param last The end of the sequence to reorder. + * \param stencil The beginning of the stencil sequence. + * \param out_true The destination of the resulting sequence of elements which satisfy \p pred. + * \param out_false The destination of the resulting sequence of elements which fail to satisfy \p pred. + * \param pred A function object which decides to which partition each element of the + * sequence [first, last) belongs. + * \return A \p pair p such that p.first is the end of the output range beginning + * at \p out_true and p.second is the end of the output range beginning at + * \p out_false. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator1 is a model of Input Iterator, + * and \p InputIterator's \c value_type is convertible to \p OutputIterator1 and \p OutputIterator2's \c value_types. + * \tparam InputIterator2 is a model of Input Iterator, + * and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type. + * \tparam OutputIterator1 is a model of Output Iterator. + * \tparam OutputIterator2 is a model of Output Iterator. + * \tparam Predicate is a model of Predicate. + * + * \pre The input ranges shall not overlap with either output range. + * + * The following code snippet demonstrates how to use \p partition_copy to separate a + * sequence into two output sequences of even and odd numbers using the \p thrust::host execution + * policy for parallelization. + * + * \code + * #include + * #include + * #include + * ... + * int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + * int S[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; + * int result[10]; + * const int N = sizeof(A)/sizeof(int); + * int *evens = result; + * int *odds = result + 5; + * thrust::stable_partition_copy(thrust::host, A, A + N, S, evens, odds, thrust::identity()); + * // A remains {1, 2, 3, 4, 5, 6, 7, 8, 9, 10} + * // S remains {0, 1, 0, 1, 0, 1, 0, 1, 0, 1} + * // result is now {2, 4, 6, 8, 10, 1, 3, 5, 7, 9} + * // evens points to {2, 4, 6, 8, 10} + * // odds points to {1, 3, 5, 7, 9} + * \endcode + * + * \note The relative order of elements in the two reordered sequences is not + * necessarily the same as it was in the original sequence. A different algorithm, + * \ref stable_partition_copy, does guarantee to preserve the relative order. + * + * \see http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2569.pdf + * \see \p stable_partition_copy + * \see \p partition + */ +template + thrust::pair + partition_copy(const thrust::detail::execution_policy_base &exec, + InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator1 out_true, + OutputIterator2 out_false, + Predicate pred); + + +/*! \p partition_copy differs from \ref partition only in that the reordered + * sequence is written to difference output sequences, rather than in place. + * + * \p partition_copy copies the elements [first, last) based on the + * function object \p pred which is applied to a range of stencil elements. 
All of the elements + * whose corresponding stencil element satisfies \p pred are copied to the range beginning at \p out_true + * and all the elements whose stencil element fails to satisfy it are copied to the range beginning + * at \p out_false. + * + * \param first The beginning of the sequence to reorder. + * \param last The end of the sequence to reorder. + * \param stencil The beginning of the stencil sequence. + * \param out_true The destination of the resulting sequence of elements which satisfy \p pred. + * \param out_false The destination of the resulting sequence of elements which fail to satisfy \p pred. + * \param pred A function object which decides to which partition each element of the + * sequence [first, last) belongs. + * \return A \p pair p such that p.first is the end of the output range beginning + * at \p out_true and p.second is the end of the output range beginning at + * \p out_false. + * + * \tparam InputIterator1 is a model of Input Iterator, + * and \p InputIterator's \c value_type is convertible to \p OutputIterator1 and \p OutputIterator2's \c value_types. + * \tparam InputIterator2 is a model of Input Iterator, + * and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type. + * \tparam OutputIterator1 is a model of Output Iterator. + * \tparam OutputIterator2 is a model of Output Iterator. + * \tparam Predicate is a model of Predicate. + * + * \pre The input ranges shall not overlap with either output range. + * + * The following code snippet demonstrates how to use \p partition_copy to separate a + * sequence into two output sequences of even and odd numbers. + * + * \code + * #include + * #include + * ... + * int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + * int S[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; + * int result[10]; + * const int N = sizeof(A)/sizeof(int); + * int *evens = result; + * int *odds = result + 5; + * thrust::stable_partition_copy(A, A + N, S, evens, odds, thrust::identity()); + * // A remains {1, 2, 3, 4, 5, 6, 7, 8, 9, 10} + * // S remains {0, 1, 0, 1, 0, 1, 0, 1, 0, 1} + * // result is now {2, 4, 6, 8, 10, 1, 3, 5, 7, 9} + * // evens points to {2, 4, 6, 8, 10} + * // odds points to {1, 3, 5, 7, 9} + * \endcode + * + * \note The relative order of elements in the two reordered sequences is not + * necessarily the same as it was in the original sequence. A different algorithm, + * \ref stable_partition_copy, does guarantee to preserve the relative order. + * + * \see http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2569.pdf + * \see \p stable_partition_copy + * \see \p partition + */ +template + thrust::pair + partition_copy(InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator1 out_true, + OutputIterator2 out_false, + Predicate pred); + + +/*! \p stable_partition is much like \ref partition : it reorders the elements in the + * range [first, last) based on the function object \p pred, such that all of + * the elements that satisfy \p pred precede all of the elements that fail to satisfy + * it. The postcondition is that, for some iterator \p middle in the range + * [first, last), pred(*i) is \c true for every iterator \c i in the + * range [first,middle) and \c false for every iterator \c i in the range + * [middle, last). The return value of \p stable_partition is \c middle. + * + * \p stable_partition differs from \ref partition in that \p stable_partition is + * guaranteed to preserve relative order. 
That is, if \c x and \c y are elements in + * [first, last), and \c stencil_x and \c stencil_y are the stencil elements + * in corresponding positions within [stencil, stencil + (last - first)), + * and pred(stencil_x) == pred(stencil_y), and if \c x precedes + * \c y, then it will still be true after \p stable_partition that \c x precedes \c y. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The first element of the sequence to reorder. + * \param last One position past the last element of the sequence to reorder. + * \param pred A function object which decides to which partition each element of the + * sequence [first, last) belongs. + * \return An iterator referring to the first element of the second partition, that is, + * the sequence of the elements which do not satisfy pred. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam ForwardIterator is a model of Forward Iterator, + * and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type, + * and \p ForwardIterator is mutable. + * \tparam Predicate is a model of Predicate. + * + * The following code snippet demonstrates how to use \p stable_partition to reorder a + * sequence so that even numbers precede odd numbers using the \p thrust::host execution policy for parallelization: + * + * \code + * #include + * #include + * ... + * struct is_even + * { + * __host__ __device__ + * bool operator()(const int &x) + * { + * return (x % 2) == 0; + * } + * }; + * ... + * int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + * const int N = sizeof(A)/sizeof(int); + * thrust::stable_partition(thrust::host, + * A, A + N, + * is_even()); + * // A is now {2, 4, 6, 8, 10, 1, 3, 5, 7, 9} + * \endcode + * + * \see http://www.sgi.com/tech/stl/stable_partition.html + * \see \p partition + * \see \p stable_partition_copy + */ +template + ForwardIterator stable_partition(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + Predicate pred); + + +/*! \p stable_partition is much like \ref partition : it reorders the elements in the + * range [first, last) based on the function object \p pred, such that all of + * the elements that satisfy \p pred precede all of the elements that fail to satisfy + * it. The postcondition is that, for some iterator \p middle in the range + * [first, last), pred(*i) is \c true for every iterator \c i in the + * range [first,middle) and \c false for every iterator \c i in the range + * [middle, last). The return value of \p stable_partition is \c middle. + * + * \p stable_partition differs from \ref partition in that \p stable_partition is + * guaranteed to preserve relative order. That is, if \c x and \c y are elements in + * [first, last), and \c stencil_x and \c stencil_y are the stencil elements + * in corresponding positions within [stencil, stencil + (last - first)), + * and pred(stencil_x) == pred(stencil_y), and if \c x precedes + * \c y, then it will still be true after \p stable_partition that \c x precedes \c y. + * + * \param first The first element of the sequence to reorder. + * \param last One position past the last element of the sequence to reorder. + * \param pred A function object which decides to which partition each element of the + * sequence [first, last) belongs. 
+ * \return An iterator referring to the first element of the second partition, that is, + * the sequence of the elements which do not satisfy pred. + * + * \tparam ForwardIterator is a model of Forward Iterator, + * and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type, + * and \p ForwardIterator is mutable. + * \tparam Predicate is a model of Predicate. + * + * The following code snippet demonstrates how to use \p stable_partition to reorder a + * sequence so that even numbers precede odd numbers. + * + * \code + * #include + * ... + * struct is_even + * { + * __host__ __device__ + * bool operator()(const int &x) + * { + * return (x % 2) == 0; + * } + * }; + * ... + * int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + * const int N = sizeof(A)/sizeof(int); + * thrust::stable_partition(A, A + N, + * is_even()); + * // A is now {2, 4, 6, 8, 10, 1, 3, 5, 7, 9} + * \endcode + * + * \see http://www.sgi.com/tech/stl/stable_partition.html + * \see \p partition + * \see \p stable_partition_copy + */ +template + ForwardIterator stable_partition(ForwardIterator first, + ForwardIterator last, + Predicate pred); + + +/*! \p stable_partition is much like \p partition: it reorders the elements in the + * range [first, last) based on the function object \p pred applied to a stencil + * range [stencil, stencil + (last - first)), such that all of + * the elements whose corresponding stencil element satisfies \p pred precede all of the elements whose + * corresponding stencil element fails to satisfy it. The postcondition is that, for some iterator + * \c middle in the range [first, last), pred(*stencil_i) is \c true for every iterator + * \c stencil_i in the range [stencil,stencil + (middle - first)) and \c false for every iterator \c stencil_i + * in the range [stencil + (middle - first), stencil + (last - first)). + * The return value of \p stable_partition is \c middle. + * + * \p stable_partition differs from \ref partition in that \p stable_partition is + * guaranteed to preserve relative order. That is, if \c x and \c y are elements in + * [first, last), such that pred(x) == pred(y), and if \c x precedes + * \c y, then it will still be true after \p stable_partition that \c x precedes \c y. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The first element of the sequence to reorder. + * \param last One position past the last element of the sequence to reorder. + * \param stencil The beginning of the stencil sequence. + * \param pred A function object which decides to which partition each element of the + * sequence [first, last) belongs. + * \return An iterator referring to the first element of the second partition, that is, + * the sequence of the elements whose stencil elements do not satisfy \p pred. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam ForwardIterator is a model of Forward Iterator, + * and \p ForwardIterator is mutable. + * \tparam InputIterator is a model of Input Iterator, + * and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type. + * \tparam Predicate is a model of Predicate. + * + * \pre The range [first, last) shall not overlap with the range [stencil, stencil + (last - first)). 
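+ *
+ * The stability guarantee can be seen in a small host-side sketch (the values
+ * and stencil below are arbitrary):
+ *
+ * \code
+ * #include <thrust/partition.h>
+ * #include <thrust/functional.h>
+ * ...
+ * int keys[]    = {3, 1, 4, 1, 5};
+ * int stencil[] = {0, 1, 0, 1, 0};
+ * thrust::stable_partition(keys, keys + 5, stencil, thrust::identity<int>());
+ * // keys is now {1, 1, 3, 4, 5}: the two 1s keep their original relative
+ * // order, as do 3, 4 and 5 in the false partition
+ * \endcode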
+ * + * The following code snippet demonstrates how to use \p stable_partition to reorder a + * sequence so that even numbers precede odd numbers using the \p thrust::host execution policy for parallelization: + * + * \code + * #include + * #include + * ... + * struct is_even + * { + * __host__ __device__ + * bool operator()(const int &x) + * { + * return (x % 2) == 0; + * } + * }; + * ... + * int A[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; + * int S[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + * const int N = sizeof(A)/sizeof(int); + * thrust::stable_partition(thrust::host, A, A + N, S, is_even()); + * // A is now {1, 1, 1, 1, 1, 0, 0, 0, 0, 0} + * // S is unmodified + * \endcode + * + * \see http://www.sgi.com/tech/stl/stable_partition.html + * \see \p partition + * \see \p stable_partition_copy + */ +template + ForwardIterator stable_partition(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + InputIterator stencil, + Predicate pred); + + +/*! \p stable_partition is much like \p partition: it reorders the elements in the + * range [first, last) based on the function object \p pred applied to a stencil + * range [stencil, stencil + (last - first)), such that all of + * the elements whose corresponding stencil element satisfies \p pred precede all of the elements whose + * corresponding stencil element fails to satisfy it. The postcondition is that, for some iterator + * \c middle in the range [first, last), pred(*stencil_i) is \c true for every iterator + * \c stencil_i in the range [stencil,stencil + (middle - first)) and \c false for every iterator \c stencil_i + * in the range [stencil + (middle - first), stencil + (last - first)). + * The return value of \p stable_partition is \c middle. + * + * \p stable_partition differs from \ref partition in that \p stable_partition is + * guaranteed to preserve relative order. That is, if \c x and \c y are elements in + * [first, last), such that pred(x) == pred(y), and if \c x precedes + * \c y, then it will still be true after \p stable_partition that \c x precedes \c y. + * + * \param first The first element of the sequence to reorder. + * \param last One position past the last element of the sequence to reorder. + * \param stencil The beginning of the stencil sequence. + * \param pred A function object which decides to which partition each element of the + * sequence [first, last) belongs. + * \return An iterator referring to the first element of the second partition, that is, + * the sequence of the elements whose stencil elements do not satisfy \p pred. + * + * \tparam ForwardIterator is a model of Forward Iterator, + * and \p ForwardIterator is mutable. + * \tparam InputIterator is a model of Input Iterator, + * and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type. + * \tparam Predicate is a model of Predicate. + * + * \pre The range [first, last) shall not overlap with the range [stencil, stencil + (last - first)). + * + * The following code snippet demonstrates how to use \p stable_partition to reorder a + * sequence so that even numbers precede odd numbers. + * + * \code + * #include + * ... + * struct is_even + * { + * __host__ __device__ + * bool operator()(const int &x) + * { + * return (x % 2) == 0; + * } + * }; + * ... 
+ * int A[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; + * int S[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + * const int N = sizeof(A)/sizeof(int); + * thrust::stable_partition(A, A + N, S, is_even()); + * // A is now {1, 1, 1, 1, 1, 0, 0, 0, 0, 0} + * // S is unmodified + * \endcode + * + * \see http://www.sgi.com/tech/stl/stable_partition.html + * \see \p partition + * \see \p stable_partition_copy + */ +template + ForwardIterator stable_partition(ForwardIterator first, + ForwardIterator last, + InputIterator stencil, + Predicate pred); + + +/*! \p stable_partition_copy differs from \ref stable_partition only in that the reordered + * sequence is written to different output sequences, rather than in place. + * + * \p stable_partition_copy copies the elements [first, last) based on the + * function object \p pred. All of the elements that satisfy \p pred are copied + * to the range beginning at \p out_true and all the elements that fail to satisfy it + * are copied to the range beginning at \p out_false. + * + * \p stable_partition_copy differs from \ref partition_copy in that + * \p stable_partition_copy is guaranteed to preserve relative order. That is, if + * \c x and \c y are elements in [first, last), such that + * pred(x) == pred(y), and if \c x precedes \c y, then it will still be true + * after \p stable_partition_copy that \c x precedes \c y in the output. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The first element of the sequence to reorder. + * \param last One position past the last element of the sequence to reorder. + * \param out_true The destination of the resulting sequence of elements which satisfy \p pred. + * \param out_false The destination of the resulting sequence of elements which fail to satisfy \p pred. + * \param pred A function object which decides to which partition each element of the + * sequence [first, last) belongs. + * \return A \p pair p such that p.first is the end of the output range beginning + * at \p out_true and p.second is the end of the output range beginning at + * \p out_false. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator is a model of Input Iterator, + * and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type and \p InputIterator's \c value_type + * is convertible to \p OutputIterator1 and \p OutputIterator2's \c value_types. + * \tparam OutputIterator1 is a model of Output Iterator. + * \tparam OutputIterator2 is a model of Output Iterator. + * \tparam Predicate is a model of Predicate. + * + * \pre The input ranges shall not overlap with either output range. + * + * The following code snippet demonstrates how to use \p stable_partition_copy to + * reorder a sequence so that even numbers precede odd numbers using the \p thrust::host execution policy for parallelization: + * + * \code + * #include + * #include + * ... + * struct is_even + * { + * __host__ __device__ + * bool operator()(const int &x) + * { + * return (x % 2) == 0; + * } + * }; + * ... 
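+ * // the even values are copied, in order, to the first half of result and
+ * // the odd values to the second half: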
+ * int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + * int result[10]; + * const int N = sizeof(A)/sizeof(int); + * int *evens = result; + * int *odds = result + 5; + * thrust::stable_partition_copy(thrust::host, A, A + N, evens, odds, is_even()); + * // A remains {1, 2, 3, 4, 5, 6, 7, 8, 9, 10} + * // result is now {2, 4, 6, 8, 10, 1, 3, 5, 7, 9} + * // evens points to {2, 4, 6, 8, 10} + * // odds points to {1, 3, 5, 7, 9} + * \endcode + * + * \see http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2569.pdf + * \see \p partition_copy + * \see \p stable_partition + */ +template + thrust::pair + stable_partition_copy(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + OutputIterator1 out_true, + OutputIterator2 out_false, + Predicate pred); + + +/*! \p stable_partition_copy differs from \ref stable_partition only in that the reordered + * sequence is written to different output sequences, rather than in place. + * + * \p stable_partition_copy copies the elements [first, last) based on the + * function object \p pred. All of the elements that satisfy \p pred are copied + * to the range beginning at \p out_true and all the elements that fail to satisfy it + * are copied to the range beginning at \p out_false. + * + * \p stable_partition_copy differs from \ref partition_copy in that + * \p stable_partition_copy is guaranteed to preserve relative order. That is, if + * \c x and \c y are elements in [first, last), such that + * pred(x) == pred(y), and if \c x precedes \c y, then it will still be true + * after \p stable_partition_copy that \c x precedes \c y in the output. + * + * \param first The first element of the sequence to reorder. + * \param last One position past the last element of the sequence to reorder. + * \param out_true The destination of the resulting sequence of elements which satisfy \p pred. + * \param out_false The destination of the resulting sequence of elements which fail to satisfy \p pred. + * \param pred A function object which decides to which partition each element of the + * sequence [first, last) belongs. + * \return A \p pair p such that p.first is the end of the output range beginning + * at \p out_true and p.second is the end of the output range beginning at + * \p out_false. + * + * \tparam InputIterator is a model of Input Iterator, + * and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type and \p InputIterator's \c value_type + * is convertible to \p OutputIterator1 and \p OutputIterator2's \c value_types. + * \tparam OutputIterator1 is a model of Output Iterator. + * \tparam OutputIterator2 is a model of Output Iterator. + * \tparam Predicate is a model of Predicate. + * + * \pre The input ranges shall not overlap with either output range. + * + * The following code snippet demonstrates how to use \p stable_partition_copy to + * reorder a sequence so that even numbers precede odd numbers. + * + * \code + * #include + * ... + * struct is_even + * { + * __host__ __device__ + * bool operator()(const int &x) + * { + * return (x % 2) == 0; + * } + * }; + * ... 
+ * int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + * int result[10]; + * const int N = sizeof(A)/sizeof(int); + * int *evens = result; + * int *odds = result + 5; + * thrust::stable_partition_copy(A, A + N, evens, odds, is_even()); + * // A remains {1, 2, 3, 4, 5, 6, 7, 8, 9, 10} + * // result is now {2, 4, 6, 8, 10, 1, 3, 5, 7, 9} + * // evens points to {2, 4, 6, 8, 10} + * // odds points to {1, 3, 5, 7, 9} + * \endcode + * + * \see http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2569.pdf + * \see \p partition_copy + * \see \p stable_partition + */ +template + thrust::pair + stable_partition_copy(InputIterator first, + InputIterator last, + OutputIterator1 out_true, + OutputIterator2 out_false, + Predicate pred); + + +/*! \p stable_partition_copy differs from \ref stable_partition only in that the reordered + * sequence is written to different output sequences, rather than in place. + * + * \p stable_partition_copy copies the elements [first, last) based on the + * function object \p pred which is applied to a range of stencil elements. All of the elements + * whose corresponding stencil element satisfies \p pred are copied to the range beginning at \p out_true + * and all the elements whose stencil element fails to satisfy it are copied to the range beginning + * at \p out_false. + * + * \p stable_partition_copy differs from \ref partition_copy in that + * \p stable_partition_copy is guaranteed to preserve relative order. That is, if + * \c x and \c y are elements in [first, last), such that + * pred(x) == pred(y), and if \c x precedes \c y, then it will still be true + * after \p stable_partition_copy that \c x precedes \c y in the output. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The first element of the sequence to reorder. + * \param last One position past the last element of the sequence to reorder. + * \param stencil The beginning of the stencil sequence. + * \param out_true The destination of the resulting sequence of elements which satisfy \p pred. + * \param out_false The destination of the resulting sequence of elements which fail to satisfy \p pred. + * \param pred A function object which decides to which partition each element of the + * sequence [first, last) belongs. + * \return A \p pair p such that p.first is the end of the output range beginning + * at \p out_true and p.second is the end of the output range beginning at + * \p out_false. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator1 is a model of Input Iterator, + * and \p InputIterator's \c value_type is convertible to \p OutputIterator1 and \p OutputIterator2's \c value_types. + * \tparam InputIterator2 is a model of Input Iterator, + * and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type. + * \tparam OutputIterator1 is a model of Output Iterator. + * \tparam OutputIterator2 is a model of Output Iterator. + * \tparam Predicate is a model of Predicate. + * + * \pre The input ranges shall not overlap with either output range. + * + * The following code snippet demonstrates how to use \p stable_partition_copy to + * reorder a sequence so that even numbers precede odd numbers using the \p thrust::host execution policy for parallelization: + * + * \code + * #include + * #include + * #include + * ... 
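+ * // elements of A whose stencil entry in S is non-zero are copied to evens,
+ * // the rest to odds: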
+ * int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + * int S[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; + * int result[10]; + * const int N = sizeof(A)/sizeof(int); + * int *evens = result; + * int *odds = result + 5; + * thrust::stable_partition_copy(thrust::host, A, A + N, S, evens, odds, thrust::identity()); + * // A remains {1, 2, 3, 4, 5, 6, 7, 8, 9, 10} + * // S remains {0, 1, 0, 1, 0, 1, 0, 1, 0, 1} + * // result is now {2, 4, 6, 8, 10, 1, 3, 5, 7, 9} + * // evens points to {2, 4, 6, 8, 10} + * // odds points to {1, 3, 5, 7, 9} + * \endcode + * + * \see http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2569.pdf + * \see \p partition_copy + * \see \p stable_partition + */ +template + thrust::pair + stable_partition_copy(const thrust::detail::execution_policy_base &exec, + InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator1 out_true, + OutputIterator2 out_false, + Predicate pred); + + +/*! \p stable_partition_copy differs from \ref stable_partition only in that the reordered + * sequence is written to different output sequences, rather than in place. + * + * \p stable_partition_copy copies the elements [first, last) based on the + * function object \p pred which is applied to a range of stencil elements. All of the elements + * whose corresponding stencil element satisfies \p pred are copied to the range beginning at \p out_true + * and all the elements whose stencil element fails to satisfy it are copied to the range beginning + * at \p out_false. + * + * \p stable_partition_copy differs from \ref partition_copy in that + * \p stable_partition_copy is guaranteed to preserve relative order. That is, if + * \c x and \c y are elements in [first, last), such that + * pred(x) == pred(y), and if \c x precedes \c y, then it will still be true + * after \p stable_partition_copy that \c x precedes \c y in the output. + * + * \param first The first element of the sequence to reorder. + * \param last One position past the last element of the sequence to reorder. + * \param stencil The beginning of the stencil sequence. + * \param out_true The destination of the resulting sequence of elements which satisfy \p pred. + * \param out_false The destination of the resulting sequence of elements which fail to satisfy \p pred. + * \param pred A function object which decides to which partition each element of the + * sequence [first, last) belongs. + * \return A \p pair p such that p.first is the end of the output range beginning + * at \p out_true and p.second is the end of the output range beginning at + * \p out_false. + * + * \tparam InputIterator1 is a model of Input Iterator, + * and \p InputIterator's \c value_type is convertible to \p OutputIterator1 and \p OutputIterator2's \c value_types. + * \tparam InputIterator2 is a model of Input Iterator, + * and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type. + * \tparam OutputIterator1 is a model of Output Iterator. + * \tparam OutputIterator2 is a model of Output Iterator. + * \tparam Predicate is a model of Predicate. + * + * \pre The input ranges shall not overlap with either output range. + * + * The following code snippet demonstrates how to use \p stable_partition_copy to + * reorder a sequence so that even numbers precede odd numbers. + * + * \code + * #include + * #include + * ... 
+ * int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + * int S[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; + * int result[10]; + * const int N = sizeof(A)/sizeof(int); + * int *evens = result; + * int *odds = result + 5; + * thrust::stable_partition_copy(A, A + N, S, evens, odds, thrust::identity()); + * // A remains {1, 2, 3, 4, 5, 6, 7, 8, 9, 10} + * // S remains {0, 1, 0, 1, 0, 1, 0, 1, 0, 1} + * // result is now {2, 4, 6, 8, 10, 1, 3, 5, 7, 9} + * // evens points to {2, 4, 6, 8, 10} + * // odds points to {1, 3, 5, 7, 9} + * \endcode + * + * \see http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2569.pdf + * \see \p partition_copy + * \see \p stable_partition + */ +template + thrust::pair + stable_partition_copy(InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator1 out_true, + OutputIterator2 out_false, + Predicate pred); + + +/*! \} // end stream_compaction + */ + +/*! \} // end reordering + */ + +/*! \addtogroup searching + * \{ + */ + + +/*! \p partition_point returns an iterator pointing to the end of the true + * partition of a partitioned range. \p partition_point requires the input range + * [first,last) to be a partition; that is, all elements which satisfy + * pred shall appear before those that do not. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the range to consider. + * \param last The end of the range to consider. + * \param pred A function object which decides to which partition each element of the + * range [first, last) belongs. + * \return An iterator \c mid such that all_of(first, mid, pred) + * and none_of(mid, last, pred) are both true. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam ForwardIterator is a model of Forward Iterator, + * and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type. + * \tparam Predicate is a model of Predicate. + * + * \pre The range [first, last) shall be partitioned by \p pred. + * + * \note Though similar, \p partition_point is not redundant with \p find_if_not. + * \p partition_point's precondition provides an opportunity for a + * faster implemention. + * + * \code + * #include + * #include + * + * struct is_even + * { + * __host__ __device__ + * bool operator()(const int &x) + * { + * return (x % 2) == 0; + * } + * }; + * + * ... + * + * int A[] = {2, 4, 6, 8, 10, 1, 3, 5, 7, 9}; + * int * B = thrust::partition_point(thrust::host, A, A + 10, is_even()); + * // B - A is 5 + * // [A, B) contains only even values + * \endcode + * + * \see \p partition + * \see \p find_if_not + */ +template + ForwardIterator partition_point(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + Predicate pred); + + +/*! \p partition_point returns an iterator pointing to the end of the true + * partition of a partitioned range. \p partition_point requires the input range + * [first,last) to be a partition; that is, all elements which satisfy + * pred shall appear before those that do not. + * \param first The beginning of the range to consider. + * \param last The end of the range to consider. + * \param pred A function object which decides to which partition each element of the + * range [first, last) belongs. + * \return An iterator \c mid such that all_of(first, mid, pred) + * and none_of(mid, last, pred) are both true. 
+ * + * \tparam ForwardIterator is a model of Forward Iterator, + * and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type. + * \tparam Predicate is a model of Predicate. + * + * \pre The range [first, last) shall be partitioned by \p pred. + * + * \note Though similar, \p partition_point is not redundant with \p find_if_not. + * \p partition_point's precondition provides an opportunity for a + * faster implemention. + * + * \code + * #include + * + * struct is_even + * { + * __host__ __device__ + * bool operator()(const int &x) + * { + * return (x % 2) == 0; + * } + * }; + * + * ... + * + * int A[] = {2, 4, 6, 8, 10, 1, 3, 5, 7, 9}; + * int * B = thrust::partition_point(A, A + 10, is_even()); + * // B - A is 5 + * // [A, B) contains only even values + * \endcode + * + * \see \p partition + * \see \p find_if_not + */ +template + ForwardIterator partition_point(ForwardIterator first, + ForwardIterator last, + Predicate pred); + +/*! \} // searching + */ + +/*! \addtogroup reductions + * \{ + * \addtogroup predicates + * \{ + */ + + +/*! \p is_partitioned returns \c true if the given range + * is partitioned with respect to a predicate, and \c false otherwise. + * + * Specifically, \p is_partitioned returns \c true if [first, last) + * is empty of if [first, last) is partitioned by \p pred, i.e. if + * all elements that satisfy \p pred appear before those that do not. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the range to consider. + * \param last The end of the range to consider. + * \param pred A function object which decides to which partition each element of the + * range [first, last) belongs. + * \return \c true if the range [first, last) is partitioned with respect + * to \p pred, or if [first, last) is empty. \c false, otherwise. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator is a model of Input Iterator, + * and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type. + * \tparam Predicate is a model of Predicate. + * + * \code + * #include + * #include + * + * struct is_even + * { + * __host__ __device__ + * bool operator()(const int &x) + * { + * return (x % 2) == 0; + * } + * }; + * + * ... + * + * int A[] = {2, 4, 6, 8, 10, 1, 3, 5, 7, 9}; + * int B[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + * + * thrust::is_partitioned(thrust::host, A, A + 10); // returns true + * thrust::is_partitioned(thrust::host, B, B + 10); // returns false + * \endcode + * + * \see \p partition + */ +template + bool is_partitioned(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + Predicate pred); + + +/*! \p is_partitioned returns \c true if the given range + * is partitioned with respect to a predicate, and \c false otherwise. + * + * Specifically, \p is_partitioned returns \c true if [first, last) + * is empty of if [first, last) is partitioned by \p pred, i.e. if + * all elements that satisfy \p pred appear before those that do not. + * + * \param first The beginning of the range to consider. + * \param last The end of the range to consider. + * \param pred A function object which decides to which partition each element of the + * range [first, last) belongs. + * \return \c true if the range [first, last) is partitioned with respect + * to \p pred, or if [first, last) is empty. \c false, otherwise. 
+ * + * \tparam InputIterator is a model of Input Iterator, + * and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type. + * \tparam Predicate is a model of Predicate. + * + * \code + * #include + * + * struct is_even + * { + * __host__ __device__ + * bool operator()(const int &x) + * { + * return (x % 2) == 0; + * } + * }; + * + * ... + * + * int A[] = {2, 4, 6, 8, 10, 1, 3, 5, 7, 9}; + * int B[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + * + * thrust::is_partitioned(A, A + 10); // returns true + * thrust::is_partitioned(B, B + 10); // returns false + * \endcode + * + * \see \p partition + */ +template + bool is_partitioned(InputIterator first, + InputIterator last, + Predicate pred); + + +/*! \} // end predicates + * \} // end reductions + */ + + +} // end thrust + +#include + diff --git a/compat/thrust/random.h b/compat/thrust/random.h new file mode 100644 index 0000000..5a2c00d --- /dev/null +++ b/compat/thrust/random.h @@ -0,0 +1,120 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file random.h + * \brief Pseudo-random number generators. + */ + +#pragma once + +#include +#include + +// RNGs +#include +#include +#include +#include +#include + +// distributions +#include +#include +#include + +namespace thrust +{ + + +/*! \addtogroup random Random Number Generation + * \{ + */ + + +/*! \namespace thrust::random + * \brief \p thrust::random is the namespace which contains random number engine class templates, + * random number engine adaptor class templates, engines with predefined parameters, + * and random number distribution class templates. They are provided in a separate namespace + * for import convenience but are also aliased in the top-level \p thrust namespace for + * easy access. + */ +namespace random +{ + +/*! \addtogroup predefined_random Random Number Engines with Predefined Parameters + * \ingroup random + * \{ + */ + +/*! \typedef ranlux24 + * \brief A random number engine with predefined parameters which implements the + * RANLUX level-3 random number generation algorithm. + * \note The 10000th consecutive invocation of a default-constructed object of type \p ranlux24 + * shall produce the value \c 9901578 . + */ +typedef discard_block_engine ranlux24; + + +/*! \typedef ranlux48 + * \brief A random number engine with predefined parameters which implements the + * RANLUX level-4 random number generation algorithm. + * \note The 10000th consecutive invocation of a default-constructed object of type \p ranlux48 + * shall produce the value \c 88229545517833 . + */ +typedef discard_block_engine ranlux48; + + +/*! \typedef taus88 + * \brief A random number engine with predefined parameters which implements + * L'Ecuyer's 1996 three-component Tausworthe random number generator. + * + * \note The 10000th consecutive invocation of a default-constructed object of type \p taus88 + * shall produce the value \c 3535848941 . 
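+ *
+ * A brief usage sketch (a default-constructed engine is used here; the
+ * distribution shown is one of several provided by <thrust/random.h>):
+ *
+ * \code
+ * #include <thrust/random.h>
+ * ...
+ * thrust::taus88 rng;                                   // default-seeded engine
+ * thrust::random::uniform_int_distribution<int> dist(1, 6);
+ * int roll = dist(rng);                                 // a value in [1, 6]
+ * \endcode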
+ */ +typedef xor_combine_engine< + linear_feedback_shift_engine, + 0, + xor_combine_engine< + linear_feedback_shift_engine, 0, + linear_feedback_shift_engine, 0 + >, + 0 +> taus88; + +/*! \typedef default_random_engine + * \brief An implementation-defined "default" random number engine. + * \note \p default_random_engine is currently an alias for \p minstd_rand, and may change + * in a future version. + */ +typedef minstd_rand default_random_engine; + +/*! \} // end predefined_random + */ + +} // end random + + +/*! \} // end random + */ + +// import names into thrust:: +using random::ranlux24; +using random::ranlux48; +using random::taus88; +using random::default_random_engine; + +} // end thrust + diff --git a/compat/thrust/random/detail/discard_block_engine.inl b/compat/thrust/random/detail/discard_block_engine.inl new file mode 100644 index 0000000..5f01bd1 --- /dev/null +++ b/compat/thrust/random/detail/discard_block_engine.inl @@ -0,0 +1,201 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +namespace thrust +{ + +namespace random +{ + + +template + discard_block_engine + ::discard_block_engine() + : m_e(), m_n(0) +{} + + +template + discard_block_engine + ::discard_block_engine(result_type s) + : m_e(s), m_n(0) +{} + + +template + discard_block_engine + ::discard_block_engine(const base_type &urng) + : m_e(urng), m_n(0) +{} + + +template + void discard_block_engine + ::seed(void) +{ + m_e.seed(); + m_n = 0; +} + + +template + void discard_block_engine + ::seed(result_type s) +{ + m_e.seed(s); + m_n = 0; +} + + +template + typename discard_block_engine::result_type + discard_block_engine + ::operator()(void) +{ + if(m_n >= used_block) + { + m_e.discard(block_size - m_n); +// for(; m_n < block_size; ++m_n) +// m_e(); + m_n = 0; + } + + ++m_n; + + return m_e(); +} + + +template + void discard_block_engine + ::discard(unsigned long long z) +{ + // XXX this should be accelerated + for(; z > 0; --z) + { + this->operator()(); + } // end for +} + + +template + const typename discard_block_engine::base_type & + discard_block_engine + ::base(void) const +{ + return m_e; +} + + +template + template + std::basic_ostream& discard_block_engine + ::stream_out(std::basic_ostream &os) const +{ + typedef std::basic_ostream ostream_type; + typedef typename ostream_type::ios_base ios_base; + + // save old flags & fill character + const typename ios_base::fmtflags flags = os.flags(); + const CharT fill = os.fill(); + + const CharT space = os.widen(' '); + os.flags(ios_base::dec | ios_base::fixed | ios_base::left); + os.fill(space); + + // output the base engine followed by n + os << m_e << space << m_n; + + // restore flags & fill character + os.flags(flags); + os.fill(fill); + + return os; +} + + +template + template + std::basic_istream& discard_block_engine + ::stream_in(std::basic_istream &is) +{ + typedef std::basic_istream istream_type; + typedef typename istream_type::ios_base ios_base; + + // save old flags + const 
typename ios_base::fmtflags flags = is.flags(); + + is.flags(ios_base::skipws); + + // input the base engine and then n + is >> m_e >> m_n; + + // restore old flags + is.flags(flags); + return is; +} + + +template + bool discard_block_engine + ::equal(const discard_block_engine &rhs) const +{ + return (m_e == rhs.m_e) && (m_n == rhs.m_n); +} + + +template +std::basic_ostream& +operator<<(std::basic_ostream &os, + const discard_block_engine &e) +{ + return thrust::random::detail::random_core_access::stream_out(os,e); +} + + +template +std::basic_istream& +operator>>(std::basic_istream &is, + discard_block_engine &e) +{ + return thrust::random::detail::random_core_access::stream_in(is,e); +} + + +template +bool operator==(const discard_block_engine &lhs, + const discard_block_engine &rhs) +{ + return thrust::random::detail::random_core_access::equal(lhs,rhs); +} + + +template +bool operator!=(const discard_block_engine &lhs, + const discard_block_engine &rhs) +{ + return !(lhs == rhs); +} + + +} // end random + +} // end thrust + diff --git a/compat/thrust/random/detail/linear_congruential_engine.inl b/compat/thrust/random/detail/linear_congruential_engine.inl new file mode 100644 index 0000000..f040563 --- /dev/null +++ b/compat/thrust/random/detail/linear_congruential_engine.inl @@ -0,0 +1,163 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
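The stream_out, stream_in and equal members implemented above give the adaptor a textual serialization round trip. The same round trip can be exercised with std::ranlux24, the standard library's discard_block_engine-based engine, which supports the equivalent operators (a host-only sketch, not part of the patch):

#include <random>
#include <sstream>
#include <iostream>

int main()
{
  std::ranlux24 e1;
  e1.discard(42);                    // advance partway into a block

  std::ostringstream out;
  out << e1;                         // serialize: base-engine state followed by the block counter

  std::ranlux24 e2;
  std::istringstream in(out.str());
  in >> e2;                          // restore the exact state

  std::cout << std::boolalpha
            << (e1 == e2) << "\n"      // true
            << (e1() == e2()) << "\n"; // true: both produce the same next value
  return 0;
}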
+ */ + +#include +#include +#include + +namespace thrust +{ + +namespace random +{ + + +template + linear_congruential_engine + ::linear_congruential_engine(result_type s) +{ + seed(s); +} // end linear_congruential_engine::linear_congruential_engine() + + +template + void linear_congruential_engine + ::seed(result_type s) +{ + if((detail::mod(c) == 0) && + (detail::mod(s) == 0)) + m_x = detail::mod(1); + else + m_x = detail::mod(s); +} // end linear_congruential_engine::seed() + + +template + typename linear_congruential_engine::result_type + linear_congruential_engine + ::operator()(void) +{ + m_x = detail::mod(m_x); + return m_x; +} // end linear_congruential_engine::operator()() + + +template + void linear_congruential_engine + ::discard(unsigned long long z) +{ + thrust::random::detail::linear_congruential_engine_discard::discard(*this,z); +} // end linear_congruential_engine::discard() + + +template + template + std::basic_ostream& linear_congruential_engine + ::stream_out(std::basic_ostream &os) const +{ + typedef std::basic_ostream ostream_type; + typedef typename ostream_type::ios_base ios_base; + + // save old flags & fill character + const typename ios_base::fmtflags flags = os.flags(); + const CharT fill = os.fill(); + + os.flags(ios_base::dec | ios_base::fixed | ios_base::left); + os.fill(os.widen(' ')); + + // output one word of state + os << m_x; + + // restore flags & fill character + os.flags(flags); + os.fill(fill); + + return os; +} + + +template + template + std::basic_istream& linear_congruential_engine + ::stream_in(std::basic_istream &is) +{ + typedef std::basic_istream istream_type; + typedef typename istream_type::ios_base ios_base; + + // save old flags + const typename ios_base::fmtflags flags = is.flags(); + + is.flags(ios_base::dec); + + // input one word of state + is >> m_x; + + // restore flags + is.flags(flags); + + return is; +} + + +template +bool linear_congruential_engine + ::equal(const linear_congruential_engine &rhs) const +{ + return m_x == rhs.m_x; +} + + +template +__host__ __device__ +bool operator==(const linear_congruential_engine &lhs, + const linear_congruential_engine &rhs) +{ + return detail::random_core_access::equal(lhs,rhs); +} + + +template +bool operator!=(const linear_congruential_engine &lhs, + const linear_congruential_engine &rhs) +{ + return !(lhs == rhs); +} + + +template +std::basic_ostream& +operator<<(std::basic_ostream &os, + const linear_congruential_engine &e) +{ + return detail::random_core_access::stream_out(os,e); +} + + +template +std::basic_istream& +operator>>(std::basic_istream &is, + linear_congruential_engine &e) +{ + return detail::random_core_access::stream_in(is,e); +} + + +} // end random + +} // end thrust + diff --git a/compat/thrust/random/detail/linear_congruential_engine_discard.h b/compat/thrust/random/detail/linear_congruential_engine_discard.h new file mode 100644 index 0000000..f4ec233 --- /dev/null +++ b/compat/thrust/random/detail/linear_congruential_engine_discard.h @@ -0,0 +1,107 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
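The linear_congruential_engine implementation above advances its single word of state as x = (a * x + c) mod m. For the minstd parameters (a = 48271, c = 0, m = 2^31 - 1, the constants behind std::minstd_rand) the recurrence can be cross-checked in plain 64-bit arithmetic (a host-only sketch, not part of the patch):

#include <cstdint>
#include <random>
#include <iostream>

int main()
{
  const std::uint64_t a = 48271, m = 2147483647;  // minstd_rand: c == 0
  std::uint64_t x = 1;                            // default seed

  std::minstd_rand ref;                           // same parameters, default-seeded
  bool match = true;
  for (int i = 0; i < 1000; ++i)
  {
    x = (a * x) % m;                              // one LCG step, no overflow in 64 bits
    match = match && (x == ref());
  }
  std::cout << std::boolalpha << match << "\n";   // true
  return 0;
}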
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace thrust +{ + +namespace random +{ + +namespace detail +{ + + +template + struct linear_congruential_engine_discard_implementation +{ + __host__ __device__ + static void discard(UIntType &state, unsigned long long z) + { + for(; z > 0; --z) + { + state = detail::mod(state); + } + } +}; // end linear_congruential_engine_discard + + +// specialize for small integers and c == 0 +// XXX figure out a robust implemenation of this for any unsigned integer type later +template + struct linear_congruential_engine_discard_implementation +{ + __host__ __device__ + static void discard(thrust::detail::uint32_t &state, unsigned long long z) + { + const thrust::detail::uint32_t modulus = m; + + // XXX we need to use unsigned long long here or we will encounter overflow in the + // multiplies below + // figure out a robust implementation of this later + unsigned long long multiplier = a; + unsigned long long multiplier_to_z = 1; + + // see http://en.wikipedia.org/wiki/Modular_exponentiation + while(z > 0) + { + if(z & 1) + { + // multiply in this bit's contribution while using modulus to keep result small + multiplier_to_z = (multiplier_to_z * multiplier) % modulus; + } + + // move to the next bit of the exponent, square (and mod) the base accordingly + z >>= 1; + multiplier = (multiplier * multiplier) % modulus; + } + + state = static_cast((multiplier_to_z * state) % modulus); + } +}; // end linear_congruential_engine_discard + + +struct linear_congruential_engine_discard +{ + template + __host__ __device__ + static void discard(LinearCongruentialEngine &lcg, unsigned long long z) + { + typedef typename LinearCongruentialEngine::result_type result_type; + const result_type c = LinearCongruentialEngine::increment; + const result_type a = LinearCongruentialEngine::multiplier; + const result_type m = LinearCongruentialEngine::modulus; + + // XXX WAR unused variable warnings + (void) c; + (void) a; + (void) m; + + linear_congruential_engine_discard_implementation::discard(lcg.m_x, z); + } +}; // end linear_congruential_engine_discard + + +} // end detail + +} // end random + +} // end thrust + diff --git a/compat/thrust/random/detail/linear_feedback_shift_engine.inl b/compat/thrust/random/detail/linear_feedback_shift_engine.inl new file mode 100644 index 0000000..4e8dad5 --- /dev/null +++ b/compat/thrust/random/detail/linear_feedback_shift_engine.inl @@ -0,0 +1,158 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
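The specialization above skips z steps of a c == 0 generator in O(log z) time: since x_{n+z} = (a^z * x_n) mod m, it raises the multiplier to the z-th power with square-and-multiply, reducing mod m at every step. A standalone sketch of the same idea, checked against brute-force stepping (the minstd constants are used purely for illustration):

#include <cstdint>
#include <iostream>

typedef std::uint64_t u64;

// compute (a^z * x) mod m by square-and-multiply, mirroring the loop above
u64 skip_ahead(u64 x, u64 a, u64 m, unsigned long long z)
{
  u64 apow = 1;
  a %= m;
  while (z > 0)
  {
    if (z & 1) apow = (apow * a) % m;   // fold in this bit of the exponent
    a = (a * a) % m;                    // square (and reduce) the base
    z >>= 1;
  }
  return (apow * x) % m;
}

int main()
{
  const u64 a = 48271, m = 2147483647;  // minstd: c == 0
  const u64 x0 = 1;

  // brute force: one million individual steps
  u64 y = x0;
  for (int i = 0; i < 1000000; ++i) y = (a * y) % m;

  // closed form: a single modular exponentiation
  std::cout << (skip_ahead(x0, a, m, 1000000) == y) << "\n";  // prints 1
  return 0;
}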
+ */ + +#include + +namespace thrust +{ + +namespace random +{ + +template + linear_feedback_shift_engine + ::linear_feedback_shift_engine(result_type value) +{ + seed(value); +} // end linear_feedback_shift_engine::linear_feedback_shift_engine() + +template + void linear_feedback_shift_engine + ::seed(result_type value) +{ + m_value = value; +} // end linear_feedback_shift_engine::seed() + +template + typename linear_feedback_shift_engine::result_type + linear_feedback_shift_engine + ::operator()(void) +{ + const UIntType b = (((m_value << q) ^ m_value) & wordmask) >> (k-s); + const UIntType mask = ( (~static_cast(0)) << (w-k) ) & wordmask; + m_value = ((m_value & mask) << s) ^ b; + return m_value; +} // end linear_feedback_shift_engine::operator()() + + +template + void linear_feedback_shift_engine + ::discard(unsigned long long z) +{ + for(; z > 0; --z) + { + this->operator()(); + } // end for +} // end linear_feedback_shift_engine::discard() + + +template + template + std::basic_ostream& linear_feedback_shift_engine + ::stream_out(std::basic_ostream &os) const +{ + typedef std::basic_ostream ostream_type; + typedef typename ostream_type::ios_base ios_base; + + // save old flags & fill character + const typename ios_base::fmtflags flags = os.flags(); + const CharT fill = os.fill(); + + os.flags(ios_base::dec | ios_base::fixed | ios_base::left); + os.fill(os.widen(' ')); + + // output one word of state + os << m_value; + + // restore flags & fill character + os.flags(flags); + os.fill(fill); + + return os; +} + + +template + template + std::basic_istream& linear_feedback_shift_engine + ::stream_in(std::basic_istream &is) +{ + typedef std::basic_istream istream_type; + typedef typename istream_type::ios_base ios_base; + + // save old flags + const typename ios_base::fmtflags flags = is.flags(); + + is.flags(ios_base::skipws); + + // input one word of state + is >> m_value; + + // restore flags + is.flags(flags); + + return is; +} + + +template + bool linear_feedback_shift_engine + ::equal(const linear_feedback_shift_engine &rhs) const +{ + return m_value == rhs.m_value; +} + + +template +bool operator==(const linear_feedback_shift_engine &lhs, + const linear_feedback_shift_engine &rhs) +{ + return thrust::random::detail::random_core_access::equal(lhs,rhs); +} + + +template +bool operator!=(const linear_feedback_shift_engine &lhs, + const linear_feedback_shift_engine &rhs) +{ + return !(lhs == rhs); +} + + +template +std::basic_ostream& +operator<<(std::basic_ostream &os, + const linear_feedback_shift_engine &e) +{ + return thrust::random::detail::random_core_access::stream_out(os,e); +} + + +template +std::basic_istream& +operator>>(std::basic_istream &is, + linear_feedback_shift_engine &e) +{ + return thrust::random::detail::random_core_access::stream_in(is,e); +} + + +} // end random + +} // end thrust + diff --git a/compat/thrust/random/detail/linear_feedback_shift_engine_wordmask.h b/compat/thrust/random/detail/linear_feedback_shift_engine_wordmask.h new file mode 100644 index 0000000..ed9e51e --- /dev/null +++ b/compat/thrust/random/detail/linear_feedback_shift_engine_wordmask.h @@ -0,0 +1,47 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
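operator() above performs one Tausworthe step: b = (((x << q) ^ x) & wordmask) >> (k - s), then x becomes ((x & mask) << s) ^ b, where wordmask is 2^w - 1 and mask keeps the top k bits of the w-bit word. The same step spelled out as a free function (the constants (w, k, q, s) = (32, 31, 13, 12) are those of the first taus88 component and are used here only for illustration):

#include <cstdint>
#include <iostream>

// one linear_feedback_shift step with run-time parameters
std::uint32_t lfs_step(std::uint32_t x, unsigned w, unsigned k, unsigned q, unsigned s)
{
  const std::uint32_t wordmask = (w < 32) ? ((1u << w) - 1u) : 0xFFFFFFFFu;
  const std::uint32_t mask     = (~std::uint32_t(0) << (w - k)) & wordmask;

  const std::uint32_t b = (((x << q) ^ x) & wordmask) >> (k - s);
  return ((x & mask) << s) ^ b;
}

int main()
{
  std::uint32_t x = 123456789u;
  for (int i = 0; i < 5; ++i)
  {
    x = lfs_step(x, 32, 31, 13, 12);   // first component of taus88
    std::cout << x << "\n";
  }
  return 0;
}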
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace thrust +{ + +namespace random +{ + +namespace detail +{ + +template + struct linear_feedback_shift_engine_wordmask +{ + static const T value = + (T(1u) << i) | + linear_feedback_shift_engine_wordmask::value; +}; // end linear_feedback_shift_engine_wordmask + +template + struct linear_feedback_shift_engine_wordmask +{ + static const T value = 0; +}; // end linear_feedback_shift_engine_wordmask + +} // end detail + +} // end random + +} // end thrust + diff --git a/compat/thrust/random/detail/mod.h b/compat/thrust/random/detail/mod.h new file mode 100644 index 0000000..ceb2191 --- /dev/null +++ b/compat/thrust/random/detail/mod.h @@ -0,0 +1,97 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace thrust +{ + +namespace random +{ + +namespace detail +{ + +template + struct static_mod +{ + static const T q = m / a; + static const T r = m % a; + + __host__ __device__ + T operator()(T x) const + { + if(a == 1) + { + x %= m; + } + else + { + T t1 = a * (x % q); + T t2 = r * (x / q); + if(t1 >= t2) + { + x = t1 - t2; + } + else + { + x = m - t2 + t1; + } + } + + if(c != 0) + { + const T d = m - x; + if(d > c) + { + x += c; + } + else + { + x = c - d; + } + } + + return x; + } +}; // end static_mod + + +// Rely on machine overflow handling +template + struct static_mod +{ + __host__ __device__ + T operator()(T x) const + { + return a * x + c; + } +}; // end static_mod + +template +__host__ __device__ + T mod(T x) +{ + static_mod f; + return f(x); +} // end static_mod + +} // end detail + +} // end random + +} // end thrust + diff --git a/compat/thrust/random/detail/normal_distribution.inl b/compat/thrust/random/detail/normal_distribution.inl new file mode 100644 index 0000000..1bb55d7 --- /dev/null +++ b/compat/thrust/random/detail/normal_distribution.inl @@ -0,0 +1,241 @@ +/* + * + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
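static_mod above evaluates (a * x + c) mod m without overflowing the state type by using Schrage's decomposition: with q = m / a and r = m % a, a * x mod m equals a * (x mod q) - r * (x / q), plus m whenever that difference is negative; the trick is valid when r < q, as it is for the minstd constants. A standalone check of the c == 0 case against 64-bit arithmetic (an illustrative sketch only):

#include <cstdint>
#include <iostream>

// a * x mod m by Schrage's method; every intermediate stays below m
std::uint32_t schrage_mulmod(std::uint32_t a, std::uint32_t x, std::uint32_t m)
{
  const std::uint32_t q = m / a;
  const std::uint32_t r = m % a;

  const std::uint32_t t1 = a * (x % q);   // <= a * (q - 1) < m
  const std::uint32_t t2 = r * (x / q);   // stays small for these constants
  return (t1 >= t2) ? (t1 - t2) : (m - t2 + t1);
}

int main()
{
  const std::uint32_t a = 16807, m = 2147483647;  // minstd_rand0 parameters
  std::uint32_t x = 1;        // Schrage walk
  std::uint64_t y = 1;        // 64-bit reference walk
  bool ok = true;
  for (int i = 0; i < 100000; ++i)
  {
    x = schrage_mulmod(a, x, m);
    y = (std::uint64_t(a) * y) % m;
    ok = ok && (x == y);
  }
  std::cout << std::boolalpha << ok << "\n";   // true
  return 0;
}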
+ */ + +#include +#include +#include +#include + +// for floating point infinity +#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC +#include +#else +#include +#endif + +namespace thrust +{ + +namespace random +{ + + +template + normal_distribution + ::normal_distribution(RealType a, RealType b) + :super_t(),m_param(a,b) +{ +} // end normal_distribution::normal_distribution() + + +template + normal_distribution + ::normal_distribution(const param_type &parm) + :super_t(),m_param(parm) +{ +} // end normal_distribution::normal_distribution() + + +template + void normal_distribution + ::reset(void) +{ + super_t::reset(); +} // end normal_distribution::reset() + + +template + template + typename normal_distribution::result_type + normal_distribution + ::operator()(UniformRandomNumberGenerator &urng) +{ + return operator()(urng, m_param); +} // end normal_distribution::operator()() + + +template + template + typename normal_distribution::result_type + normal_distribution + ::operator()(UniformRandomNumberGenerator &urng, + const param_type &parm) +{ + return super_t::sample(urng, parm.first, parm.second); +} // end normal_distribution::operator()() + + +template + typename normal_distribution::param_type + normal_distribution + ::param(void) const +{ + return m_param; +} // end normal_distribution::param() + + +template + void normal_distribution + ::param(const param_type &parm) +{ + m_param = parm; +} // end normal_distribution::param() + + +template + typename normal_distribution::result_type + normal_distribution + ::min THRUST_PREVENT_MACRO_SUBSTITUTION (void) const +{ + return -this->max(); +} // end normal_distribution::min() + + +template + typename normal_distribution::result_type + normal_distribution + ::max THRUST_PREVENT_MACRO_SUBSTITUTION (void) const +{ + // XXX this solution is pretty terrible + // we can't use numeric_traits::max because nvcc will + // complain that it is a __host__ function + union + { + thrust::detail::uint32_t inf_as_int; + float result; + } hack; + + hack.inf_as_int = 0x7f800000u; + + return hack.result; +} // end normal_distribution::max() + + +template + typename normal_distribution::result_type + normal_distribution + ::mean(void) const +{ + return m_param.first; +} // end normal_distribution::mean() + + +template + typename normal_distribution::result_type + normal_distribution + ::stddev(void) const +{ + return m_param.second; +} // end normal_distribution::stddev() + + +template + bool normal_distribution + ::equal(const normal_distribution &rhs) const +{ + return m_param == rhs.param(); +} + + +template + template + std::basic_ostream& + normal_distribution + ::stream_out(std::basic_ostream &os) const +{ + typedef std::basic_ostream ostream_type; + typedef typename ostream_type::ios_base ios_base; + + // save old flags and fill character + const typename ios_base::fmtflags flags = os.flags(); + const CharT fill = os.fill(); + + const CharT space = os.widen(' '); + os.flags(ios_base::dec | ios_base::fixed | ios_base::left); + os.fill(space); + + os << mean() << space << stddev(); + + // restore old flags and fill character + os.flags(flags); + os.fill(fill); + return os; +} + + +template + template + std::basic_istream& + normal_distribution + ::stream_in(std::basic_istream &is) +{ + typedef std::basic_istream istream_type; + typedef typename istream_type::ios_base ios_base; + + // save old flags + const typename ios_base::fmtflags flags = is.flags(); + + is.flags(ios_base::skipws); + + is >> m_param.first >> m_param.second; + + // restore old 
flags + is.flags(flags); + return is; +} + + +template +bool operator==(const normal_distribution &lhs, + const normal_distribution &rhs) +{ + return thrust::random::detail::random_core_access::equal(lhs,rhs); +} + + +template +bool operator!=(const normal_distribution &lhs, + const normal_distribution &rhs) +{ + return !(lhs == rhs); +} + + +template +std::basic_ostream& +operator<<(std::basic_ostream &os, + const normal_distribution &d) +{ + return thrust::random::detail::random_core_access::stream_out(os,d); +} + + +template +std::basic_istream& +operator>>(std::basic_istream &is, + normal_distribution &d) +{ + return thrust::random::detail::random_core_access::stream_in(is,d); +} + + +} // end random + +} // end thrust + diff --git a/compat/thrust/random/detail/normal_distribution_base.h b/compat/thrust/random/detail/normal_distribution_base.h new file mode 100644 index 0000000..d916611 --- /dev/null +++ b/compat/thrust/random/detail/normal_distribution_base.h @@ -0,0 +1,149 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright Jens Maurer 2000-2001 + * Distributed under the Boost Software License, Version 1.0. (See + * accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace random +{ +namespace detail +{ + +// this version samples the normal distribution directly +// and uses the non-standard math function erfcinv +template + class normal_distribution_nvcc +{ + protected: + template + __host__ __device__ + RealType sample(UniformRandomNumberGenerator &urng, const RealType mean, const RealType stddev) + { + typedef typename UniformRandomNumberGenerator::result_type uint_type; + const uint_type urng_range = UniformRandomNumberGenerator::max - UniformRandomNumberGenerator::min; + + // Constants for conversion + const RealType S1 = static_cast(1) / urng_range; + const RealType S2 = S1 / 2; + + RealType S3 = static_cast(-1.4142135623730950488016887242097); // -sqrt(2) + + // Get the integer value + uint_type u = urng() - UniformRandomNumberGenerator::min; + + // Ensure the conversion to float will give a value in the range [0,0.5) + if(u > (urng_range / 2)) + { + u = urng_range - u; + S3 = -S3; + } + + // Convert to floating point in [0,0.5) + RealType p = u*S1 + S2; + + // Apply inverse error function + return mean + stddev * S3 * erfcinv(2 * p); + } + + // no-op + __host__ __device__ + void reset() {} +}; + +// this version samples the normal distribution using +// Marsaglia's "polar method" +template + class normal_distribution_portable +{ + protected: + normal_distribution_portable() + : m_valid(false) + {} + + normal_distribution_portable(const normal_distribution_portable &other) + : m_valid(other.m_valid) + {} + + void reset() + { + m_valid = false; + } + + // note that we promise to call this member function with the same mean and stddev + template + __host__ __device__ 
+ RealType sample(UniformRandomNumberGenerator &urng, const RealType mean, const RealType stddev) + { + // implementation from Boost + // allow for Koenig lookup + using std::sqrt; using std::log; using std::sin; using std::cos; + + if(!m_valid) + { + uniform_real_distribution u01; + m_r1 = u01(urng); + m_r2 = u01(urng); + m_cached_rho = sqrt(-RealType(2) * log(RealType(1)-m_r2)); + + m_valid = true; + } + else + { + m_valid = false; + } + + const RealType pi = RealType(3.14159265358979323846); + + RealType result = m_cached_rho * (m_valid ? + cos(RealType(2)*pi*m_r1) : + sin(RealType(2)*pi*m_r1)); + + return result; + } + + private: + RealType m_r1, m_r2, m_cached_rho; + bool m_valid; +}; + +template + struct normal_distribution_base +{ +#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC + typedef normal_distribution_nvcc type; +#else + typedef normal_distribution_portable type; +#endif +}; + +} // end detail +} // end random +} // end thrust + diff --git a/compat/thrust/random/detail/random_core_access.h b/compat/thrust/random/detail/random_core_access.h new file mode 100644 index 0000000..81f58e2 --- /dev/null +++ b/compat/thrust/random/detail/random_core_access.h @@ -0,0 +1,57 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace thrust +{ + +namespace random +{ + +namespace detail +{ + +struct random_core_access +{ + +template +static OStream &stream_out(OStream &os, const EngineOrDistribution &x) +{ + return x.stream_out(os); +} + +template +static IStream &stream_in(IStream &is, EngineOrDistribution &x) +{ + return x.stream_in(is); +} + +template +__host__ __device__ +static bool equal(const EngineOrDistribution &lhs, const EngineOrDistribution &rhs) +{ + return lhs.equal(rhs); +} + +}; // end random_core_access + +} // end detail + +} // end random + +} // end thrust + diff --git a/compat/thrust/random/detail/subtract_with_carry_engine.inl b/compat/thrust/random/detail/subtract_with_carry_engine.inl new file mode 100644 index 0000000..a58b266 --- /dev/null +++ b/compat/thrust/random/detail/subtract_with_carry_engine.inl @@ -0,0 +1,203 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
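The portable sampler above consumes two uniforms per pair of outputs: it caches rho = sqrt(-2 * log(1 - u2)) and returns rho * cos(2*pi*u1) on one call and rho * sin(2*pi*u1) on the next. A self-contained sketch of that transform, with a rough empirical check that the resulting deviates have mean 0 and variance 1 (host-only, not part of the patch; a std:: generator is used just to supply uniforms):

#include <cmath>
#include <random>
#include <iostream>

// turn two uniforms in [0,1) into two standard-normal deviates,
// mirroring the cached-rho scheme used above
void normal_pair(double u1, double u2, double &n1, double &n2)
{
  const double pi  = 3.14159265358979323846;
  const double rho = std::sqrt(-2.0 * std::log(1.0 - u2));
  n1 = rho * std::cos(2.0 * pi * u1);
  n2 = rho * std::sin(2.0 * pi * u1);
}

int main()
{
  std::minstd_rand urng;
  std::uniform_real_distribution<double> u01(0.0, 1.0);

  const int N = 200000;
  double sum = 0.0, sumsq = 0.0;
  for (int i = 0; i < N; i += 2)
  {
    double a, b;
    normal_pair(u01(urng), u01(urng), a, b);
    sum   += a + b;
    sumsq += a * a + b * b;
  }

  const double mean = sum / N;
  const double var  = sumsq / N - mean * mean;
  std::cout << "mean ~ " << mean << ", variance ~ " << var << "\n";  // ~0 and ~1
  return 0;
}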
+ */ + +#include +#include +#include +#include + +namespace thrust +{ + +namespace random +{ + + +template + subtract_with_carry_engine + ::subtract_with_carry_engine(result_type value) +{ + seed(value); +} // end subtract_with_carry_engine::subtract_with_carry_engine() + + +template + void subtract_with_carry_engine + ::seed(result_type value) +{ + thrust::random::linear_congruential_engine e(value == 0u ? default_seed : value); + + // initialize state + for(size_t i = 0; i < long_lag; ++i) + { + m_x[i] = detail::mod(e()); + } // end for i + + m_carry = (m_x[long_lag-1] == 0); + m_k = 0; +} // end subtract_with_carry_engine::seed() + + +template + typename subtract_with_carry_engine::result_type + subtract_with_carry_engine + ::operator()(void) +{ + // XXX we probably need to cache these m_x[m_k] in a register + // maybe we need to cache the use of all member variables + int short_index = m_k - short_lag; + if(short_index < 0) + short_index += long_lag; + result_type xi; + if (m_x[short_index] >= m_x[m_k] + m_carry) + { + // x(n) >= 0 + xi = m_x[short_index] - m_x[m_k] - m_carry; + m_carry = 0; + } + else + { + // x(n) < 0 + xi = modulus - m_x[m_k] - m_carry + m_x[short_index]; + m_carry = 1; + } + m_x[m_k] = xi; + ++m_k; + if(m_k >= long_lag) + m_k = 0; + return xi; +} // end subtract_with_carry_engine::operator()() + + +template + void subtract_with_carry_engine + ::discard(unsigned long long z) +{ + for(; z > 0; --z) + { + this->operator()(); + } // end for +} // end subtract_with_carry_engine::discard() + + +template + template + std::basic_ostream& subtract_with_carry_engine + ::stream_out(std::basic_ostream &os) const +{ + typedef std::basic_ostream ostream_type; + typedef typename ostream_type::ios_base ios_base; + + const typename ios_base::fmtflags flags = os.flags(); + const CharT fill = os.fill(); + const CharT space = os.widen(' '); + os.flags(ios_base::dec | ios_base::fixed | ios_base::left); + os.fill(space); + + const UIntType long_lag = r; + + for(size_t i = 0; i < r; ++i) + os << m_x[(i + m_k) % long_lag] << space; + os << m_carry; + + os.flags(flags); + os.fill(fill); + return os; +} + + +template + template + std::basic_istream& subtract_with_carry_engine + ::stream_in(std::basic_istream &is) +{ + typedef std::basic_istream istream_type; + typedef typename istream_type::ios_base ios_base; + + const typename ios_base::fmtflags flags = is.flags(); + is.flags(ios_base::dec | ios_base::skipws); + + for(size_t i = 0; i < r; ++i) + is >> m_x[i]; + is >> m_carry; + + m_k = 0; + + is.flags(flags); + return is; +} + + +template + bool subtract_with_carry_engine + ::equal(const subtract_with_carry_engine &rhs) const +{ + const UIntType long_lag = r; + + bool result = true; + for(size_t i = 0; i < r; ++i) + { + result &= (m_x[(i + m_k) % long_lag] == rhs.m_x[(i + rhs.m_k) % long_lag]); + } + + // XXX not sure if this last check is necessary + result &= (m_carry == rhs.m_carry); + + return result; +} + + +template + std::basic_ostream& + operator<<(std::basic_ostream &os, + const subtract_with_carry_engine &e) +{ + return thrust::random::detail::random_core_access::stream_out(os,e); +} + + +template + std::basic_istream& + operator>>(std::basic_istream &is, + subtract_with_carry_engine &e) +{ + return thrust::random::detail::random_core_access::stream_in(is,e); +} + + +template + bool operator==(const subtract_with_carry_engine &lhs, + const subtract_with_carry_engine &rhs) +{ + return thrust::random::detail::random_core_access::equal(lhs,rhs); +} + + +template + bool 
operator!=(const subtract_with_carry_engine &lhs, + const subtract_with_carry_engine &rhs) +{ + return !(lhs == rhs); +} + + +} // end random + +} // end thrust + diff --git a/compat/thrust/random/detail/uniform_int_distribution.inl b/compat/thrust/random/detail/uniform_int_distribution.inl new file mode 100644 index 0000000..e92754c --- /dev/null +++ b/compat/thrust/random/detail/uniform_int_distribution.inl @@ -0,0 +1,232 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +namespace thrust +{ + +namespace random +{ + + +template + uniform_int_distribution + ::uniform_int_distribution(IntType a, IntType b) + :m_param(a,b) +{ +} // end uniform_int_distribution::uniform_int_distribution() + + +template + uniform_int_distribution + ::uniform_int_distribution(const param_type &parm) + :m_param(parm) +{ +} // end uniform_int_distribution::uniform_int_distribution() + + +template + void uniform_int_distribution + ::reset(void) +{ +} // end uniform_int_distribution::reset() + + +template + template + typename uniform_int_distribution::result_type + uniform_int_distribution + ::operator()(UniformRandomNumberGenerator &urng) +{ + return operator()(urng, m_param); +} // end uniform_int_distribution::operator()() + + +template + template + typename uniform_int_distribution::result_type + uniform_int_distribution + ::operator()(UniformRandomNumberGenerator &urng, const param_type &parm) +{ + // XXX this implementation is somewhat hacky and will skip + // values if the range of the RNG is smaller than the range of the distribution + // we should improve this implementation in a later version + + typedef typename thrust::detail::largest_available_float::type float_type; + + const float_type real_min(parm.first); + const float_type real_max(parm.second); + + // add one to the right end of the interval because it is half-open + // XXX adding 1.0 to a potentially large floating point number seems like a bad idea + uniform_real_distribution real_dist(real_min, real_max + float_type(1)); + + return static_cast(real_dist(urng)); +} // end uniform_int_distribution::operator()() + + +template + typename uniform_int_distribution::result_type + uniform_int_distribution + ::a(void) const +{ + return m_param.first; +} // end uniform_int_distribution::a() + + +template + typename uniform_int_distribution::result_type + uniform_int_distribution + ::b(void) const +{ + return m_param.second; +} // end uniform_int_distribution::b() + + +template + typename uniform_int_distribution::param_type + uniform_int_distribution + ::param(void) const +{ + return m_param; +} // end uniform_int_distribution::param() + + +template + void uniform_int_distribution + ::param(const param_type &parm) +{ + m_param = parm; +} // end uniform_int_distribution::param() + + +template + typename uniform_int_distribution::result_type + uniform_int_distribution + ::min THRUST_PREVENT_MACRO_SUBSTITUTION (void) const +{ + return a(); +} // end 
uniform_int_distribution::min() + + +template + typename uniform_int_distribution::result_type + uniform_int_distribution + ::max THRUST_PREVENT_MACRO_SUBSTITUTION (void) const +{ + return b(); +} // end uniform_int_distribution::max() + + +template + bool uniform_int_distribution + ::equal(const uniform_int_distribution &rhs) const +{ + return param() == rhs.param(); +} + + +template + template + std::basic_ostream& + uniform_int_distribution + ::stream_out(std::basic_ostream &os) const +{ + typedef std::basic_ostream ostream_type; + typedef typename ostream_type::ios_base ios_base; + + // save old flags and fill character + const typename ios_base::fmtflags flags = os.flags(); + const CharT fill = os.fill(); + + const CharT space = os.widen(' '); + os.flags(ios_base::dec | ios_base::fixed | ios_base::left); + os.fill(space); + + os << a() << space << b(); + + // restore old flags and fill character + os.flags(flags); + os.fill(fill); + return os; +} + + +template + template + std::basic_istream& + uniform_int_distribution + ::stream_in(std::basic_istream &is) +{ + typedef std::basic_istream istream_type; + typedef typename istream_type::ios_base ios_base; + + // save old flags + const typename ios_base::fmtflags flags = is.flags(); + + is.flags(ios_base::skipws); + + is >> m_param.first >> m_param.second; + + // restore old flags + is.flags(flags); + return is; +} + + +template +bool operator==(const uniform_int_distribution &lhs, + const uniform_int_distribution &rhs) +{ + return thrust::random::detail::random_core_access::equal(lhs,rhs); +} + + +template +bool operator!=(const uniform_int_distribution &lhs, + const uniform_int_distribution &rhs) +{ + return !(lhs == rhs); +} + + +template +std::basic_ostream& +operator<<(std::basic_ostream &os, + const uniform_int_distribution &d) +{ + return thrust::random::detail::random_core_access::stream_out(os,d); +} + + +template +std::basic_istream& +operator>>(std::basic_istream &is, + uniform_int_distribution &d) +{ + return thrust::random::detail::random_core_access::stream_in(is,d); +} + + +} // end random + +} // end thrust + diff --git a/compat/thrust/random/detail/uniform_real_distribution.inl b/compat/thrust/random/detail/uniform_real_distribution.inl new file mode 100644 index 0000000..6f6d6b5 --- /dev/null +++ b/compat/thrust/random/detail/uniform_real_distribution.inl @@ -0,0 +1,217 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
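operator() above reuses uniform_real_distribution: it draws a real value in [a, b + 1) and truncates it to an integer, which yields each integer in [a, b] with roughly equal probability; the XXX comments flag the rounding caveats of that approach. A host-only sketch of the same mapping built from standard-library pieces (not part of the patch):

#include <random>
#include <iostream>

// map a generator to an integer in [a, b] by drawing a real in [a, b + 1)
// and truncating, the same scheme as the implementation above
// (in rare rounding cases the real draw can land on b + 1, the caveat noted above)
template <typename Urng>
int uniform_int_via_real(Urng &urng, int a, int b)
{
  std::uniform_real_distribution<double> real(double(a), double(b) + 1.0);
  return static_cast<int>(real(urng));
}

int main()
{
  std::minstd_rand urng;
  int counts[6] = {0, 0, 0, 0, 0, 0};

  for (int i = 0; i < 60000; ++i)
    ++counts[uniform_int_via_real(urng, 1, 6) - 1];     // a die roll in [1, 6]

  for (int f = 0; f < 6; ++f)
    std::cout << (f + 1) << ": " << counts[f] << "\n";  // each bucket is roughly 10000
  return 0;
}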
+ */ + +#include + +namespace thrust +{ + +namespace random +{ + + +template + uniform_real_distribution + ::uniform_real_distribution(RealType a, RealType b) + :m_param(a,b) +{ +} // end uniform_real_distribution::uniform_real_distribution() + +template + uniform_real_distribution + ::uniform_real_distribution(const param_type &parm) + :m_param(parm) +{ +} // end uniform_real_distribution::uniform_real_distribution() + +template + void uniform_real_distribution + ::reset(void) +{ +} // end uniform_real_distribution::reset() + +template + template + typename uniform_real_distribution::result_type + uniform_real_distribution + ::operator()(UniformRandomNumberGenerator &urng) +{ + return operator()(urng, m_param); +} // end uniform_real::operator()() + +template + template + typename uniform_real_distribution::result_type + uniform_real_distribution + ::operator()(UniformRandomNumberGenerator &urng, + const param_type &parm) +{ + // call the urng & map its result to [0,1) + result_type result = static_cast(urng() - UniformRandomNumberGenerator::min); + + // adding one to the denominator ensures that the interval is half-open at 1.0 + // XXX adding 1.0 to a potentially large floating point number seems like a bad idea + // XXX OTOH adding 1 to what is potentially UINT_MAX also seems like a bad idea + // XXX we could statically check if 1u + (max - min) is representable and do that, otherwise use the current implementation + result /= (result_type(1) + static_cast(UniformRandomNumberGenerator::max - UniformRandomNumberGenerator::min)); + + return (result * (parm.second - parm.first)) + parm.first; +} // end uniform_real::operator()() + +template + typename uniform_real_distribution::result_type + uniform_real_distribution + ::a(void) const +{ + return m_param.first; +} // end uniform_real::a() + +template + typename uniform_real_distribution::result_type + uniform_real_distribution + ::b(void) const +{ + return m_param.second; +} // end uniform_real_distribution::b() + +template + typename uniform_real_distribution::param_type + uniform_real_distribution + ::param(void) const +{ + return m_param;; +} // end uniform_real_distribution::param() + +template + void uniform_real_distribution + ::param(const param_type &parm) +{ + m_param = parm; +} // end uniform_real_distribution::param() + +template + typename uniform_real_distribution::result_type + uniform_real_distribution + ::min THRUST_PREVENT_MACRO_SUBSTITUTION (void) const +{ + return a(); +} // end uniform_real_distribution::min() + +template + typename uniform_real_distribution::result_type + uniform_real_distribution + ::max THRUST_PREVENT_MACRO_SUBSTITUTION (void) const +{ + return b(); +} // end uniform_real_distribution::max() + + +template + bool uniform_real_distribution + ::equal(const uniform_real_distribution &rhs) const +{ + return m_param == rhs.param(); +} + + +template + template + std::basic_ostream& + uniform_real_distribution + ::stream_out(std::basic_ostream &os) const +{ + typedef std::basic_ostream ostream_type; + typedef typename ostream_type::ios_base ios_base; + + // save old flags and fill character + const typename ios_base::fmtflags flags = os.flags(); + const CharT fill = os.fill(); + + const CharT space = os.widen(' '); + os.flags(ios_base::dec | ios_base::fixed | ios_base::left); + os.fill(space); + + os << a() << space << b(); + + // restore old flags and fill character + os.flags(flags); + os.fill(fill); + return os; +} + + +template + template + std::basic_istream& + uniform_real_distribution + 
::stream_in(std::basic_istream &is) +{ + typedef std::basic_istream istream_type; + typedef typename istream_type::ios_base ios_base; + + // save old flags + const typename ios_base::fmtflags flags = is.flags(); + + is.flags(ios_base::skipws); + + is >> m_param.first >> m_param.second; + + // restore old flags + is.flags(flags); + return is; +} + + +template +bool operator==(const uniform_real_distribution &lhs, + const uniform_real_distribution &rhs) +{ + return thrust::random::detail::random_core_access::equal(lhs,rhs); +} + + +template +bool operator!=(const uniform_real_distribution &lhs, + const uniform_real_distribution &rhs) +{ + return !(lhs == rhs); +} + + +template +std::basic_ostream& +operator<<(std::basic_ostream &os, + const uniform_real_distribution &d) +{ + return thrust::random::detail::random_core_access::stream_out(os,d); +} + + +template +std::basic_istream& +operator>>(std::basic_istream &is, + uniform_real_distribution &d) +{ + return thrust::random::detail::random_core_access::stream_in(is,d); +} + + +} // end random + +} // end thrust + diff --git a/compat/thrust/random/detail/xor_combine_engine.inl b/compat/thrust/random/detail/xor_combine_engine.inl new file mode 100644 index 0000000..b138722 --- /dev/null +++ b/compat/thrust/random/detail/xor_combine_engine.inl @@ -0,0 +1,203 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
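operator() above maps one raw draw of the engine to [0, 1) by subtracting the engine's minimum and dividing by 1 + (max - min), then stretches the result onto [a, b). The same arithmetic written against a standard engine (whose min and max are static member functions rather than constants), as a host-only sketch that is not part of the patch:

#include <random>
#include <iostream>

// map one raw draw of a uniform random number generator to [a, b),
// following the same arithmetic as the implementation above
template <typename Urng>
double draw_uniform_real(Urng &urng, double a, double b)
{
  const double unit =
      double(urng() - Urng::min()) /
      (1.0 + double(Urng::max() - Urng::min()));   // in [0, 1)
  return unit * (b - a) + a;
}

int main()
{
  std::minstd_rand urng;
  double lo = 1.0, hi = 0.0;
  for (int i = 0; i < 100000; ++i)
  {
    const double x = draw_uniform_real(urng, 0.0, 1.0);
    if (x < lo) lo = x;
    if (x > hi) hi = x;
  }
  std::cout << "draws fell in [" << lo << ", " << hi << "], inside [0, 1)\n";
  return 0;
}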
+ */ + +#include +#include + +namespace thrust +{ + +namespace random +{ + +template + xor_combine_engine + ::xor_combine_engine(void) + :m_b1(),m_b2() +{ +} // end xor_combine_engine::xor_combine_engine() + +template + xor_combine_engine + ::xor_combine_engine(const base1_type &urng1, const base2_type &urng2) + :m_b1(urng1),m_b2(urng2) +{ +} // end xor_combine_engine::xor_combine_engine() + +template + xor_combine_engine + ::xor_combine_engine(result_type s) + :m_b1(s),m_b2(s) +{ +} // end xor_combine_engine::xor_combine_engine() + +template + void xor_combine_engine + ::seed(void) +{ + m_b1.seed(); + m_b2.seed(); +} // end xor_combine_engine::seed() + +template + void xor_combine_engine + ::seed(result_type s) +{ + m_b1.seed(s); + m_b2.seed(s); +} // end xor_combine_engine::seed() + +template + const typename xor_combine_engine::base1_type & + xor_combine_engine + ::base1(void) const +{ + return m_b1; +} // end xor_combine_engine::base1() + +template + const typename xor_combine_engine::base2_type & + xor_combine_engine + ::base2(void) const +{ + return m_b2; +} // end xor_combine_engine::base2() + +template + typename xor_combine_engine::result_type + xor_combine_engine + ::operator()(void) +{ + return (result_type(m_b1() - base1_type::min) << shift1) ^ + (result_type(m_b2() - base2_type::min) << shift2); +} // end xor_combine_engine::operator()() + +template + void xor_combine_engine + ::discard(unsigned long long z) +{ + for(; z > 0; --z) + { + this->operator()(); + } // end for +} // end xor_combine_engine::discard() + + +template + template + std::basic_ostream& xor_combine_engine + ::stream_out(std::basic_ostream &os) const +{ + typedef std::basic_ostream ostream_type; + typedef typename ostream_type::ios_base ios_base; + + // save old flags and fill character + const typename ios_base::fmtflags flags = os.flags(); + const CharT fill = os.fill(); + + const CharT space = os.widen(' '); + os.flags(ios_base::dec | ios_base::fixed | ios_base::left); + os.fill(space); + + // output each base engine in turn + os << base1() << space << base2(); + + // restore old flags and fill character + os.flags(flags); + os.fill(fill); + return os; +} + + +template + template + std::basic_istream& xor_combine_engine + ::stream_in(std::basic_istream &is) +{ + typedef std::basic_istream istream_type; + typedef typename istream_type::ios_base ios_base; + + // save old flags + const typename ios_base::fmtflags flags = is.flags(); + + is.flags(ios_base::skipws); + + // input each base engine in turn + is >> m_b1 >> m_b2; + + // restore old flags + is.flags(flags); + return is; +} + + +template + bool xor_combine_engine + ::equal(const xor_combine_engine &rhs) const +{ + return (m_b1 == rhs.m_b1) && (m_b2 == rhs.m_b2); +} + + +template +std::basic_ostream& +operator<<(std::basic_ostream &os, + const xor_combine_engine &e) +{ + return thrust::random::detail::random_core_access::stream_out(os,e); +} + + +template +std::basic_istream& +operator>>(std::basic_istream &is, + xor_combine_engine &e) +{ + return thrust::random::detail::random_core_access::stream_in(is,e); +} + + +template +bool operator==(const xor_combine_engine &lhs, + const xor_combine_engine &rhs) +{ + return thrust::random::detail::random_core_access::equal(lhs,rhs); +} + + +template +bool operator!=(const xor_combine_engine &lhs, + const xor_combine_engine &rhs) +{ + return !(lhs == rhs); +} + + +} // end random + +} // end thrust + diff --git a/compat/thrust/random/detail/xor_combine_engine_max.h 
b/compat/thrust/random/detail/xor_combine_engine_max.h new file mode 100644 index 0000000..8bad9a4 --- /dev/null +++ b/compat/thrust/random/detail/xor_combine_engine_max.h @@ -0,0 +1,324 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ + +namespace random +{ + +namespace detail +{ + + +namespace math = thrust::detail::mpl::math; + + +namespace detail +{ + +// two cases for this function avoids compile-time warnings of overflow +template + struct lshift_w +{ + static const UIntType value = 0; +}; + + +template + struct lshift_w +{ + static const UIntType value = lhs << rhs; +}; + +} // end detail + + +template + struct lshift_w +{ + static const bool shift_will_overflow = rhs >= w; + + static const UIntType value = detail::lshift_w::value; +}; + + +template + struct lshift + : lshift_w::digits, lhs, rhs> +{}; + + +template + struct two_to_the_power + : lshift +{}; + + +template + class xor_combine_engine_max_aux_constants +{ + public: + static const result_type two_to_the_d = two_to_the_power::value; + static const result_type c = lshift::value; + + static const result_type t = + math::max< + result_type, + c, + b + >::value; + + static const result_type u = + math::min< + result_type, + c, + b + >::value; + + static const result_type p = math::log2::value; + static const result_type two_to_the_p = two_to_the_power::value; + + static const result_type k = math::div::value; +}; + + +template struct xor_combine_engine_max_aux; + + +template + struct xor_combine_engine_max_aux_case4 +{ + typedef xor_combine_engine_max_aux_constants constants; + + static const result_type k_plus_1_times_two_to_the_p = + lshift< + result_type, + math::plus::value, + constants::p + >::value; + + static const result_type M = + xor_combine_engine_max_aux< + result_type, + math::div< + result_type, + math::mod< + result_type, + constants::u, + constants::two_to_the_p + >::value, + constants::two_to_the_p + >::value, + math::mod< + result_type, + constants::t, + constants::two_to_the_p + >::value, + d + >::value; + + static const result_type value = math::plus::value; +}; + + +template + struct xor_combine_engine_max_aux_case3 +{ + typedef xor_combine_engine_max_aux_constants constants; + + static const result_type k_plus_1_times_two_to_the_p = + lshift< + result_type, + math::plus::value, + constants::p + >::value; + + static const result_type M = + xor_combine_engine_max_aux< + result_type, + math::div< + result_type, + math::mod< + result_type, + constants::t, + constants::two_to_the_p + >::value, + constants::two_to_the_p + >::value, + math::mod< + result_type, + constants::u, + constants::two_to_the_p + >::value, + d + >::value; + + static const result_type value = math::plus::value; +}; + + + +template + struct xor_combine_engine_max_aux_case2 +{ + typedef xor_combine_engine_max_aux_constants constants; + + static const result_type k_plus_1_times_two_to_the_p = + lshift< 
+ result_type, + math::plus::value, + constants::p + >::value; + + static const result_type value = + math::minus< + result_type, + k_plus_1_times_two_to_the_p, + 1 + >::value; +}; + + +template + struct xor_combine_engine_max_aux_case1 +{ + static const result_type c = lshift::value; + + static const result_type value = math::plus::value; +}; + + +template + struct xor_combine_engine_max_aux_2 +{ + typedef xor_combine_engine_max_aux_constants constants; + + static const result_type value = + thrust::detail::eval_if< + // if k is odd... + math::is_odd::value, + thrust::detail::identity_< + thrust::detail::integral_constant< + result_type, + xor_combine_engine_max_aux_case2::value + > + >, + thrust::detail::eval_if< + // otherwise if a * 2^3 >= b, then case 3 + a * constants::two_to_the_d >= b, + thrust::detail::identity_< + thrust::detail::integral_constant< + result_type, + xor_combine_engine_max_aux_case3::value + > + >, + // otherwise, case 4 + thrust::detail::identity_< + thrust::detail::integral_constant< + result_type, + xor_combine_engine_max_aux_case4::value + > + > + > + >::type::value; +}; + + +template::value)> + struct xor_combine_engine_max_aux_1 + : xor_combine_engine_max_aux_case1 +{}; + + +template + struct xor_combine_engine_max_aux_1 + : xor_combine_engine_max_aux_2 +{}; + + +template + struct xor_combine_engine_max_aux + : xor_combine_engine_max_aux_1 +{}; + + +template + struct xor_combine_engine_max +{ + static const size_t w = std::numeric_limits::digits; + + static const result_type m1 = + math::min< + result_type, + result_type(Engine1::max - Engine1::min), + two_to_the_power::value - 1 + >::value; + + static const result_type m2 = + math::min< + result_type, + result_type(Engine2::max - Engine2::min), + two_to_the_power::value - 1 + >::value; + + static const result_type s = s1 - s2; + + static const result_type M = + xor_combine_engine_max_aux< + result_type, + m1, + m2, + s + >::value; + + // the value is M(m1,m2,s) lshift_w s2 + static const result_type value = + lshift_w< + result_type, + w, + M, + s2 + >::value; +}; // end xor_combine_engine_max + +} // end detail + +} // end random + +} // end thrust + diff --git a/compat/thrust/random/discard_block_engine.h b/compat/thrust/random/discard_block_engine.h new file mode 100644 index 0000000..c902c58 --- /dev/null +++ b/compat/thrust/random/discard_block_engine.h @@ -0,0 +1,252 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file discard_block_engine.h + * \brief A random number engine which adapts a base engine and produces + * numbers by discarding all but a contiguous blocks of its values. + */ + +#pragma once + +#include + +#include +#include +#include +#include + +namespace thrust +{ + +namespace random +{ + +/*! \addtogroup random_number_engine_adaptors Random Number Engine Adaptor Class Templates + * \ingroup random + * \{ + */ + +/*! 
\class discard_block_engine + * \brief A \p discard_block_engine adapts an existing base random number engine and produces + * random values by discarding some of the values returned by its base engine. + * Each cycle of the compound engine begins by returning \c r values successively produced + * by the base engine and ends by discarding p-r such values. The engine's state + * is the state of its base engine followed by the number of calls to operator() + * that have occurred since the beginning of the current cycle. + * + * \tparam Engine The type of the base random number engine to adapt. + * \tparam p The discard cycle length. + * \tparam r The number of values to return of the base engine. Because p-r will be + * discarded, r <= p. + * + * The following code snippet shows an example of using a \p discard_block_engine instance: + * + * \code + * #include + * #include + * #include + * + * int main(void) + * { + * // create a discard_block_engine from minstd_rand, with a cycle length of 13 + * // keep every first 10 values, and discard the next 3 + * thrust::discard_block_engine rng; + * + * // print a random number to standard output + * std::cout << rng() << std::endl; + * + * return 0; + * } + * \endcode + */ +template + class discard_block_engine +{ + public: + // types + + /*! \typedef base_type + * \brief The type of the adapted base random number engine. + */ + typedef Engine base_type; + + /*! \typedef result_type + * \brief The type of the unsigned integer produced by this \p linear_congruential_engine. + */ + typedef typename base_type::result_type result_type; + + // engine characteristics + + /*! The length of the production cycle. + */ + static const size_t block_size = p; + + /*! The number of used numbers per production cycle. + */ + static const size_t used_block = r; + + /*! The smallest value this \p discard_block_engine may potentially produce. + */ + static const result_type min = base_type::min; + + /*! The largest value this \p discard_block_engine may potentially produce. + */ + static const result_type max = base_type::max; + + // constructors and seeding functions + + /*! This constructor constructs a new \p discard_block_engine and constructs + * its \p base_type engine using its null constructor. + */ + __host__ __device__ + discard_block_engine(); + + /*! This constructor constructs a new \p discard_block_engine using + * a given \p base_type engine to initialize its adapted base engine. + * + * \param urng A \p base_type to use to initialize this \p discard_block_engine's + * adapted base engine. + */ + __host__ __device__ + explicit discard_block_engine(const base_type &urng); + + /*! This constructor initializes a new \p discard_block_engine with a given seed. + * + * \param s The seed used to intialize this \p discard_block_engine's adapted base engine. + */ + __host__ __device__ + explicit discard_block_engine(result_type s); + + /*! This method initializes the state of this \p discard_block_engine's adapted base engine + * by using its \p default_seed value. + */ + __host__ __device__ + void seed(void); + + /*! This method initializes the state of this \p discard_block_engine's adapted base engine + * by using the given seed. + * + * \param s The seed with which to intialize this \p discard_block_engine's adapted base engine. + */ + __host__ __device__ + void seed(result_type s); + + // generating functions + + /*! This member function produces a new random value and updates this \p discard_block_engine's state. + * \return A new random number. 
+ */ + __host__ __device__ + result_type operator()(void); + + /*! This member function advances this \p discard_block_engine's state a given number of times + * and discards the results. + * + * \param z The number of random values to discard. + * \note This function is provided because an implementation may be able to accelerate it. + */ + __host__ __device__ + void discard(unsigned long long z); + + // property functions + + /*! This member function returns a const reference to this \p discard_block_engine's + * adapted base engine. + * + * \return A const reference to the base engine this \p discard_block_engine adapts. + */ + __host__ __device__ + const base_type &base(void) const; + + /*! \cond + */ + private: + base_type m_e; + unsigned int m_n; + + friend struct thrust::random::detail::random_core_access; + + __host__ __device__ + bool equal(const discard_block_engine &rhs) const; + + template + std::basic_ostream& stream_out(std::basic_ostream &os) const; + + template + std::basic_istream& stream_in(std::basic_istream &is); + /*! \endcond + */ +}; // end discard_block_engine + + +/*! This function checks two \p discard_block_engines for equality. + * \param lhs The first \p discard_block_engine to test. + * \param rhs The second \p discard_block_engine to test. + * \return \c true if \p lhs is equal to \p rhs; \c false, otherwise. + */ +template +__host__ __device__ +bool operator==(const discard_block_engine &lhs, + const discard_block_engine &rhs); + + +/*! This function checks two \p discard_block_engines for inequality. + * \param lhs The first \p discard_block_engine to test. + * \param rhs The second \p discard_block_engine to test. + * \return \c true if \p lhs is not equal to \p rhs; \c false, otherwise. + */ +template +__host__ __device__ +bool operator!=(const discard_block_engine &lhs, + const discard_block_engine &rhs); + + +/*! This function streams a discard_block_engine to a \p std::basic_ostream. + * \param os The \p basic_ostream to stream out to. + * \param e The \p discard_block_engine to stream out. + * \return \p os + */ +template +std::basic_ostream& +operator<<(std::basic_ostream &os, + const discard_block_engine &e); + + +/*! This function streams a discard_block_engine in from a std::basic_istream. + * \param is The \p basic_istream to stream from. + * \param e The \p discard_block_engine to stream in. + * \return \p is + */ +template +std::basic_istream& +operator>>(std::basic_istream &is, + discard_block_engine &e); + +/*! \} // end random_number_engine_adaptors + */ + +} // end random + +// import names into thrust:: +using random::discard_block_engine; + +} // end thrust + +#include + diff --git a/compat/thrust/random/linear_congruential_engine.h b/compat/thrust/random/linear_congruential_engine.h new file mode 100644 index 0000000..0added0 --- /dev/null +++ b/compat/thrust/random/linear_congruential_engine.h @@ -0,0 +1,295 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! 
\file linear_congruential_engine.h + * \brief A linear congruential pseudorandom number engine. + */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace thrust +{ + +namespace random +{ + +/*! \addtogroup random_number_engine_templates Random Number Engine Class Templates + * \ingroup random + * \{ + */ + +/*! \class linear_congruential_engine + * \brief A \p linear_congruential_engine random number engine produces unsigned integer + * random numbers using a linear congruential random number generation algorithm. + * + * The generation algorithm has the form x_i = (a * x_{i-1} + c) mod m. + * + * \tparam UIntType The type of unsigned integer to produce. + * \tparam a The multiplier used in the generation algorithm. + * \tparam c The increment used in the generation algorithm. + * \tparam m The modulus used in the generation algorithm. + * + * \note Inexperienced users should not use this class template directly. Instead, use + * \p minstd_rand or \p minstd_rand0. + * + * The following code snippet shows examples of use of a \p linear_congruential_engine instance: + * + * \code + * #include + * #include + * + * int main(void) + * { + * // create a minstd_rand object, which is an instance of linear_congruential_engine + * thrust::minstd_rand rng1; + * + * // output some random values to cout + * std::cout << rng1() << std::endl; + * + * // a random value is printed + * + * // create a new minstd_rand from a seed + * thrust::minstd_rand rng2(13); + * + * // discard some random values + * rng2.discard(13); + * + * // stream the object to an iostream + * std::cout << rng2 << std::endl; + * + * // rng2's current state is printed + * + * // print the minimum and maximum values that minstd_rand can produce + * std::cout << thrust::minstd_rand::min << std::endl; + * std::cout << thrust::minstd_rand::max << std::endl; + * + * // the range of minstd_rand is printed + * + * // save the state of rng2 to a different object + * thrust::minstd_rand rng3 = rng2; + * + * // compare rng2 and rng3 + * std::cout << (rng2 == rng3) << std::endl; + * + * // 1 is printed + * + * // re-seed rng2 with a different seed + * rng2.seed(7); + * + * // compare rng2 and rng3 + * std::cout << (rng2 == rng3) << std::endl; + * + * // 0 is printed + * + * return 0; + * } + * + * \endcode + * + * \see thrust::random::minstd_rand + * \see thrust::random::minstd_rand0 + */ +template + class linear_congruential_engine +{ + public: + // types + + /*! \typedef result_type + * \brief The type of the unsigned integer produced by this \p linear_congruential_engine. + */ + typedef UIntType result_type; + + // engine characteristics + + /*! The multiplier used in the generation algorithm. + */ + static const result_type multiplier = a; + + /*! The increment used in the generation algorithm. + */ + static const result_type increment = c; + + /*! The modulus used in the generation algorithm. + */ + static const result_type modulus = m; + + /*! The smallest value this \p linear_congruential_engine may potentially produce. + */ + static const result_type min = c == 0u ? 1u : 0u; + + /*! The largest value this \p linear_congruential_engine may potentially produce. + */ + static const result_type max = m - 1u; + + /*! The default seed of this \p linear_congruential_engine. + */ + static const result_type default_seed = 1u; + + // constructors and seeding functions + + /*! This constructor, which optionally accepts a seed, initializes a new + * \p linear_congruential_engine. 
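// A minimal sketch of the recurrence x_i = (a * x_{i-1} + c) mod m described
// above, using minstd_rand0's well-known parameters (a = 16807, c = 0,
// m = 2^31 - 1); the engine's operator() applies this same transition, so the
// two columns printed below should agree.
#include <thrust/random/linear_congruential_engine.h>
#include <cstdint>
#include <iostream>

int main(void)
{
  const std::uint64_t a = 16807u, c = 0u, m = 2147483647u;

  std::uint64_t x = 1;          // matches default_seed = 1u
  thrust::minstd_rand0 rng;     // also default-seeded with 1

  for (int i = 0; i < 5; ++i)
  {
    x = (a * x + c) % m;        // x_i = (a * x_{i-1} + c) mod m
    std::cout << x << " " << rng() << std::endl;
  }
  return 0;
}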
+ * + * \param s The seed used to intialize this \p linear_congruential_engine's state. + */ + __host__ __device__ + explicit linear_congruential_engine(result_type s = default_seed); + + /*! This method initializes this \p linear_congruential_engine's state, and optionally accepts + * a seed value. + * + * \param s The seed used to initializes this \p linear_congruential_engine's state. + */ + __host__ __device__ + void seed(result_type s = default_seed); + + // generating functions + + /*! This member function produces a new random value and updates this \p linear_congruential_engine's state. + * \return A new random number. + */ + __host__ __device__ + result_type operator()(void); + + /*! This member function advances this \p linear_congruential_engine's state a given number of times + * and discards the results. + * + * \param z The number of random values to discard. + * \note This function is provided because an implementation may be able to accelerate it. + */ + __host__ __device__ + void discard(unsigned long long z); + + /*! \cond + */ + private: + result_type m_x; + + static void transition(result_type &state); + + friend struct thrust::random::detail::random_core_access; + + friend struct thrust::random::detail::linear_congruential_engine_discard; + + __host__ __device__ + bool equal(const linear_congruential_engine &rhs) const; + + template + std::basic_ostream& stream_out(std::basic_ostream &os) const; + + template + std::basic_istream& stream_in(std::basic_istream &is); + + /*! \endcond + */ +}; // end linear_congruential_engine + + +/*! This function checks two \p linear_congruential_engines for equality. + * \param lhs The first \p linear_congruential_engine to test. + * \param rhs The second \p linear_congruential_engine to test. + * \return \c true if \p lhs is equal to \p rhs; \c false, otherwise. + */ +template +__host__ __device__ +bool operator==(const linear_congruential_engine &lhs, + const linear_congruential_engine &rhs); + + +/*! This function checks two \p linear_congruential_engines for inequality. + * \param lhs The first \p linear_congruential_engine to test. + * \param rhs The second \p linear_congruential_engine to test. + * \return \c true if \p lhs is not equal to \p rhs; \c false, otherwise. + */ +template +__host__ __device__ +bool operator!=(const linear_congruential_engine &lhs, + const linear_congruential_engine &rhs); + + +/*! This function streams a linear_congruential_engine to a \p std::basic_ostream. + * \param os The \p basic_ostream to stream out to. + * \param e The \p linear_congruential_engine to stream out. + * \return \p os + */ +template +std::basic_ostream& +operator<<(std::basic_ostream &os, + const linear_congruential_engine &e); + + +/*! This function streams a linear_congruential_engine in from a std::basic_istream. + * \param is The \p basic_istream to stream from. + * \param e The \p linear_congruential_engine to stream in. + * \return \p is + */ +template +std::basic_istream& +operator>>(std::basic_istream &is, + linear_congruential_engine &e); + + +/*! \} // random_number_engine_templates + */ + + +/*! \addtogroup predefined_random + * \{ + */ + +// XXX the type N2111 used here was uint_fast32_t + +/*! \typedef minstd_rand0 + * \brief A random number engine with predefined parameters which implements a version of + * the Minimal Standard random number generation algorithm. + * \note The 10000th consecutive invocation of a default-constructed object of type \p minstd_rand0 + * shall produce the value \c 1043618065 . 
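// Quick check of the guarantees quoted in the notes for minstd_rand0 and
// minstd_rand: the 10000th values of default-constructed engines are
// 1043618065 and 399268537, respectively. A small host-only sketch:
#include <thrust/random/linear_congruential_engine.h>
#include <cassert>
#include <iostream>

int main(void)
{
  thrust::minstd_rand0 rng0;
  thrust::minstd_rand  rng;

  thrust::minstd_rand0::result_type x0 = 0;
  thrust::minstd_rand::result_type  x  = 0;
  for (int i = 0; i < 10000; ++i) { x0 = rng0(); x = rng(); }

  assert(x0 == 1043618065u);  // value quoted for minstd_rand0
  assert(x  == 399268537u);   // value quoted for minstd_rand
  std::cout << x0 << " " << x << std::endl;
  return 0;
}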
+ */ +typedef linear_congruential_engine minstd_rand0; + + +/*! \typedef minstd_rand + * \brief A random number engine with predefined parameters which implements a version of + * the Minimal Standard random number generation algorithm. + * \note The 10000th consecutive invocation of a default-constructed object of type \p minstd_rand + * shall produce the value \c 399268537 . + */ +typedef linear_congruential_engine minstd_rand; + +/*! \} // predefined_random + */ + +} // end random + +// import names into thrust:: +using random::linear_congruential_engine; +using random::minstd_rand; +using random::minstd_rand0; + +} // end thrust + +#include + diff --git a/compat/thrust/random/linear_feedback_shift_engine.h b/compat/thrust/random/linear_feedback_shift_engine.h new file mode 100644 index 0000000..f5646c9 --- /dev/null +++ b/compat/thrust/random/linear_feedback_shift_engine.h @@ -0,0 +1,230 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file linear_feedback_shift_engine.h + * \brief A linear feedback shift pseudorandom number generator. + */ + +/* + * Copyright Jens Maurer 2002 + * + * Distributed under the Boost Software License, Version 1.0. + * (See accompanying NOTICE file for the complete license) + * + * For more information, see http://www.boost.org + */ + +#pragma once + +#include +#include +#include +#include // for size_t +#include + +namespace thrust +{ + + +namespace random +{ + +/*! \addtogroup random_number_engine_templates + * \{ + */ + +/*! \class linear_feedback_shift_engine + * \brief A \p linear_feedback_shift_engine random number engine produces + * unsigned integer random values using a linear feedback shift random number + * generation algorithm. + * + * \tparam UIntType The type of unsigned integer to produce. + * \tparam w The word size of the produced values (w <= sizeof(UIntType)). + * \tparam k The k parameter of Tausworthe's 1965 algorithm. + * \tparam q The q exponent of Tausworthe's 1965 algorithm. + * \tparam s The step size of Tausworthe's 1965 algorithm. + * + * \note linear_feedback_shift_engine is based on the Boost Template Library's linear_feedback_shift. + */ +template + class linear_feedback_shift_engine +{ + public: + // types + + /*! \typedef result_type + * \brief The type of the unsigned integer produced by this \p linear_feedback_shift_engine. + */ + typedef UIntType result_type; + + // engine characteristics + + /*! The word size of the produced values. + */ + static const size_t word_size = w; + + /*! A constant used in the generation algorithm. + */ + static const size_t exponent1 = k; + + /*! A constant used in the generation algorithm. + */ + static const size_t exponent2 = q; + + /*! The step size used in the generation algorithm. + */ + static const size_t step_size = s; + + /*! \cond + */ + private: + static const result_type wordmask = + detail::linear_feedback_shift_engine_wordmask< + result_type, + w + >::value; + /*! \endcond + */ + + public: + + /*! 
The smallest value this \p linear_feedback_shift_engine may potentially produce. + */ + static const result_type min = 0; + + /*! The largest value this \p linear_feedback_shift_engine may potentially produce. + */ + static const result_type max = wordmask; + + /*! The default seed of this \p linear_feedback_shift_engine. + */ + static const result_type default_seed = 341u; + + // constructors and seeding functions + + /*! This constructor, which optionally accepts a seed, initializes a new + * \p linear_feedback_shift_engine. + * + * \param value The seed used to intialize this \p linear_feedback_shift_engine's state. + */ + __host__ __device__ + explicit linear_feedback_shift_engine(result_type value = default_seed); + + /*! This method initializes this \p linear_feedback_shift_engine's state, and optionally accepts + * a seed value. + * + * \param value The seed used to initializes this \p linear_feedback_shift_engine's state. + */ + __host__ __device__ + void seed(result_type value = default_seed); + + // generating functions + + /*! This member function produces a new random value and updates this \p linear_feedback_shift_engine's state. + * \return A new random number. + */ + __host__ __device__ + result_type operator()(void); + + /*! This member function advances this \p linear_feedback_shift_engine's state a given number of times + * and discards the results. + * + * \param z The number of random values to discard. + * \note This function is provided because an implementation may be able to accelerate it. + */ + __host__ __device__ + void discard(unsigned long long z); + + /*! \cond + */ + private: + result_type m_value; + + friend struct thrust::random::detail::random_core_access; + + __host__ __device__ + bool equal(const linear_feedback_shift_engine &rhs) const; + + template + std::basic_ostream& stream_out(std::basic_ostream &os) const; + + template + std::basic_istream& stream_in(std::basic_istream &is); + + /*! \endcond + */ +}; // end linear_feedback_shift_engine + + +/*! This function checks two \p linear_feedback_shift_engines for equality. + * \param lhs The first \p linear_feedback_shift_engine to test. + * \param rhs The second \p linear_feedback_shift_engine to test. + * \return \c true if \p lhs is equal to \p rhs; \c false, otherwise. + */ +template +__host__ __device__ +bool operator==(const linear_feedback_shift_engine &lhs, + const linear_feedback_shift_engine &rhs); + + +/*! This function checks two \p linear_feedback_shift_engines for inequality. + * \param lhs The first \p linear_feedback_shift_engine to test. + * \param rhs The second \p linear_feedback_shift_engine to test. + * \return \c true if \p lhs is not equal to \p rhs; \c false, otherwise. + */ +template +__host__ __device__ +bool operator!=(const linear_feedback_shift_engine &lhs, + const linear_feedback_shift_engine &rhs); + + +/*! This function streams a linear_feedback_shift_engine to a \p std::basic_ostream. + * \param os The \p basic_ostream to stream out to. + * \param e The \p linear_feedback_shift_engine to stream out. + * \return \p os + */ +template +std::basic_ostream& +operator<<(std::basic_ostream &os, + const linear_feedback_shift_engine &e); + + +/*! This function streams a linear_feedback_shift_engine in from a std::basic_istream. + * \param is The \p basic_istream to stream from. + * \param e The \p linear_feedback_shift_engine to stream in. + * \return \p is + */ +template +std::basic_istream& +operator>>(std::basic_istream &is, + linear_feedback_shift_engine &e); + + +/*! 
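// Usage sketch for a single Tausworthe component on its own. The parameter
// choice <UIntType, w, k, q, s> = <uint32_t, 32, 31, 13, 12> is the classic
// first component of L'Ecuyer's taus88 and is used here purely for
// illustration; it is not mandated by this header.
#include <thrust/random/linear_feedback_shift_engine.h>
#include <cstdint>
#include <iostream>

int main(void)
{
  thrust::linear_feedback_shift_engine<std::uint32_t, 32u, 31u, 13u, 12u> rng;

  for (int i = 0; i < 3; ++i)
    std::cout << rng() << std::endl;  // three pseudorandom 32-bit values
  return 0;
}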
\} // end random_number_engine_templates + */ + + +} // end random + +// import names into thrust:: +using random::linear_feedback_shift_engine; + +} // end thrust + +#include + diff --git a/compat/thrust/random/normal_distribution.h b/compat/thrust/random/normal_distribution.h new file mode 100644 index 0000000..5543f30 --- /dev/null +++ b/compat/thrust/random/normal_distribution.h @@ -0,0 +1,275 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file normal_distribution.h + * \brief A normal (Gaussian) distribution of real-valued numbers. + */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace thrust +{ + +namespace random +{ + + +/*! \addtogroup random_number_distributions + * \{ + */ + +/*! \class normal_distribution + * \brief A \p normal_distribution random number distribution produces floating point + * Normally distributed random numbers. + * + * \tparam RealType The type of floating point number to produce. + * + * The following code snippet demonstrates examples of using a \p normal_distribution with a + * random number engine to produce random values drawn from the Normal distribution with a given + * mean and variance: + * + * \code + * #include + * #include + * + * int main(void) + * { + * // create a minstd_rand object to act as our source of randomness + * thrust::minstd_rand rng; + * + * // create a normal_distribution to produce floats from the Normal distribution + * // with mean 2.0 and standard deviation 3.5 + * thrust::random::normal_distribution dist(2.0f, 3.5f); + * + * // write a random number to standard output + * std::cout << dist(rng) << std::endl; + * + * // write the mean of the distribution, just in case we forgot + * std::cout << dist.mean() << std::endl; + * + * // 2.0 is printed + * + * // and the standard deviation + * std::cout << dist.stddev() << std::endl; + * + * // 3.5 is printed + * + * return 0; + * } + * \endcode + */ +template + class normal_distribution + : public detail::normal_distribution_base::type +{ + private: + typedef typename detail::normal_distribution_base::type super_t; + + public: + // types + + /*! \typedef result_type + * \brief The type of the floating point number produced by this \p normal_distribution. + */ + typedef RealType result_type; + + /*! \typedef param_type + * \brief The type of the object encapsulating this \p normal_distribution's parameters. + */ + typedef thrust::pair param_type; + + // constructors and reset functions + + /*! This constructor creates a new \p normal_distribution from two values defining the + * half-open interval of the distribution. + * + * \param mean The mean (expected value) of the distribution. Defaults to \c 0.0. + * \param stddev The standard deviation of the distribution. Defaults to \c 1.0. + */ + __host__ __device__ + explicit normal_distribution(RealType mean = 0.0, RealType stddev = 1.0); + + /*! 
This constructor creates a new \p normal_distribution from a \p param_type object + * encapsulating the range of the distribution. + * + * \param parm A \p param_type object encapsulating the parameters (i.e., the mean and standard deviation) of the distribution. + */ + __host__ __device__ + explicit normal_distribution(const param_type &parm); + + /*! Calling this member function guarantees that subsequent uses of this + * \p normal_distribution do not depend on values produced by any random + * number generator prior to invoking this function. + */ + __host__ __device__ + void reset(void); + + // generating functions + + /*! This method produces a new Normal random integer drawn from this \p normal_distribution's + * range using a \p UniformRandomNumberGenerator as a source of randomness. + * + * \param urng The \p UniformRandomNumberGenerator to use as a source of randomness. + */ + template + __host__ __device__ + result_type operator()(UniformRandomNumberGenerator &urng); + + /*! This method produces a new Normal random integer as if by creating a new \p normal_distribution + * from the given \p param_type object, and calling its operator() method with the given + * \p UniformRandomNumberGenerator as a source of randomness. + * + * \param urng The \p UniformRandomNumberGenerator to use as a source of randomness. + * \param parm A \p param_type object encapsulating the parameters of the \p normal_distribution + * to draw from. + */ + template + __host__ __device__ + result_type operator()(UniformRandomNumberGenerator &urng, const param_type &parm); + + // property functions + + /*! This method returns the value of the parameter with which this \p normal_distribution + * was constructed. + * + * \return The mean (expected value) of this \p normal_distribution's output. + */ + __host__ __device__ + result_type mean(void) const; + + /*! This method returns the value of the parameter with which this \p normal_distribution + * was constructed. + * + * \return The standard deviation of this \p uniform_real_distribution's output. + */ + __host__ __device__ + result_type stddev(void) const; + + /*! This method returns a \p param_type object encapsulating the parameters with which this + * \p normal_distribution was constructed. + * + * \return A \p param_type object encapsulating the parameters (i.e., the mean and standard deviation) of this \p normal_distribution. + */ + __host__ __device__ + param_type param(void) const; + + /*! This method changes the parameters of this \p normal_distribution using the values encapsulated + * in a given \p param_type object. + * + * \param parm A \p param_type object encapsulating the new parameters (i.e., the mean and variance) of this \p normal_distribution. + */ + __host__ __device__ + void param(const param_type &parm); + + /*! This method returns the smallest floating point number this \p normal_distribution can potentially produce. + * + * \return The lower bound of this \p normal_distribution's half-open interval. + */ + __host__ __device__ + result_type min THRUST_PREVENT_MACRO_SUBSTITUTION (void) const; + + /*! This method returns the smallest number larger than largest floating point number this \p uniform_real_distribution can potentially produce. + * + * \return The upper bound of this \p normal_distribution's half-open interval. + */ + __host__ __device__ + result_type max THRUST_PREVENT_MACRO_SUBSTITUTION (void) const; + + /*! 
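// The example earlier in this header appears to have lost its <float>
// template argument during extraction. A minimal restored sketch drawing
// Normally distributed floats with mean 2.0 and standard deviation 3.5:
#include <thrust/random/linear_congruential_engine.h>
#include <thrust/random/normal_distribution.h>
#include <iostream>

int main(void)
{
  thrust::minstd_rand rng;
  thrust::random::normal_distribution<float> dist(2.0f, 3.5f);

  std::cout << dist(rng) << std::endl;      // one draw from N(2, 3.5^2)
  std::cout << dist.mean() << std::endl;    // 2
  std::cout << dist.stddev() << std::endl;  // 3.5
  return 0;
}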
\cond + */ + private: + param_type m_param; + + friend struct thrust::random::detail::random_core_access; + + __host__ __device__ + bool equal(const normal_distribution &rhs) const; + + template + std::basic_ostream& stream_out(std::basic_ostream &os) const; + + template + std::basic_istream& stream_in(std::basic_istream &is); + /*! \endcond + */ +}; // end normal_distribution + + +/*! This function checks two \p normal_distributions for equality. + * \param lhs The first \p normal_distribution to test. + * \param rhs The second \p normal_distribution to test. + * \return \c true if \p lhs is equal to \p rhs; \c false, otherwise. + */ +template +__host__ __device__ +bool operator==(const normal_distribution &lhs, + const normal_distribution &rhs); + + +/*! This function checks two \p normal_distributions for inequality. + * \param lhs The first \p normal_distribution to test. + * \param rhs The second \p normal_distribution to test. + * \return \c true if \p lhs is not equal to \p rhs; \c false, otherwise. + */ +template +__host__ __device__ +bool operator!=(const normal_distribution &lhs, + const normal_distribution &rhs); + + +/*! This function streams a normal_distribution to a \p std::basic_ostream. + * \param os The \p basic_ostream to stream out to. + * \param d The \p normal_distribution to stream out. + * \return \p os + */ +template +std::basic_ostream& +operator<<(std::basic_ostream &os, + const normal_distribution &d); + + +/*! This function streams a normal_distribution in from a std::basic_istream. + * \param is The \p basic_istream to stream from. + * \param d The \p normal_distribution to stream in. + * \return \p is + */ +template +std::basic_istream& +operator>>(std::basic_istream &is, + normal_distribution &d); + + +/*! \} // end random_number_distributions + */ + + +} // end random + +using random::normal_distribution; + +} // end thrust + +#include + diff --git a/compat/thrust/random/subtract_with_carry_engine.h b/compat/thrust/random/subtract_with_carry_engine.h new file mode 100644 index 0000000..b888100 --- /dev/null +++ b/compat/thrust/random/subtract_with_carry_engine.h @@ -0,0 +1,256 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file subtract_with_carry_engine.h + * \brief A subtract-with-carry pseudorandom number generator + * based on Marsaglia & Zaman. + */ + +#pragma once + +#include +#include + +#include +#include // for size_t +#include + +namespace thrust +{ + +namespace random +{ + + +/*! \addtogroup random_number_engine_templates + * \{ + */ + +/*! \class subtract_with_carry_engine + * \brief A \p subtract_with_carry_engine random number engine produces unsigned + * integer random numbers using the subtract with carry algorithm of Marsaglia & Zaman. + * + * The generation algorithm is performed as follows: + * -# Let Y = X_{i-s}- X_{i-r} - c. + * -# Set X_i to y = T mod m. Set \c c to \c 1 if Y < 0, otherwise set \c c to \c 0. 
+ * + * This algorithm corresponds to a modular linear function of the form + * + * TA(x_i) = (a * x_i) mod b, where \c b is of the form m^r - m^s + 1 and + * a = b - (b-1)/m. + * + * \tparam UIntType The type of unsigned integer to produce. + * \tparam w The word size of the produced values ( w <= sizeof(UIntType)). + * \tparam s The short lag of the generation algorithm. + * \tparam r The long lag of the generation algorithm. + * + * \note Inexperienced users should not use this class template directly. Instead, use + * \p ranlux24_base or \p ranlux48_base, which are instances of \p subtract_with_carry_engine. + * + * \see thrust::random::ranlux24_base + * \see thrust::random::ranlux48_base + */ +template + class subtract_with_carry_engine +{ + /*! \cond + */ + private: + static const UIntType modulus = UIntType(1) << w; + /*! \endcond + */ + + public: + // types + + /*! \typedef result_type + * \brief The type of the unsigned integer produced by this \p subtract_with_carry_engine. + */ + typedef UIntType result_type; + + // engine characteristics + + /*! The word size of the produced values. + */ + static const size_t word_size = w; + + /*! The size of the short lag used in the generation algorithm. + */ + static const size_t short_lag = s; + + /*! The size of the long lag used in the generation algorithm. + */ + static const size_t long_lag = r; + + /*! The smallest value this \p subtract_with_carry_engine may potentially produce. + */ + static const result_type min = 0; + + /*! The largest value this \p subtract_with_carry_engine may potentially produce. + */ + static const result_type max = modulus - 1; + + /*! The default seed of this \p subtract_with_carry_engine. + */ + static const result_type default_seed = 19780503u; + + // constructors and seeding functions + + /*! This constructor, which optionally accepts a seed, initializes a new + * \p subtract_with_carry_engine. + * + * \param value The seed used to intialize this \p subtract_with_carry_engine's state. + */ + __host__ __device__ + explicit subtract_with_carry_engine(result_type value = default_seed); + + /*! This method initializes this \p subtract_with_carry_engine's state, and optionally accepts + * a seed value. + * + * \param value The seed used to initializes this \p subtract_with_carry_engine's state. + */ + __host__ __device__ + void seed(result_type value = default_seed); + + // generating functions + + /*! This member function produces a new random value and updates this \p subtract_with_carry_engine's state. + * \return A new random number. + */ + __host__ __device__ + result_type operator()(void); + + /*! This member function advances this \p subtract_with_carry_engine's state a given number of times + * and discards the results. + * + * \param z The number of random values to discard. + * \note This function is provided because an implementation may be able to accelerate it. + */ + __host__ __device__ + void discard(unsigned long long z); + + /*! \cond + */ + private: + result_type m_x[long_lag]; + unsigned int m_k; + int m_carry; + + friend struct thrust::random::detail::random_core_access; + + __host__ __device__ + bool equal(const subtract_with_carry_engine &rhs) const; + + template + std::basic_ostream& stream_out(std::basic_ostream &os) const; + + template + std::basic_istream& stream_in(std::basic_istream &is); + + /*! \endcond + */ +}; // end subtract_with_carry_engine + + +/*! This function checks two \p subtract_with_carry_engines for equality. 
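// Usage sketch: ranlux24_base is the predefined subtract-with-carry engine;
// the assertion checks the 10000th value quoted in the note for ranlux24_base
// further below (7937952).
#include <thrust/random/subtract_with_carry_engine.h>
#include <cassert>
#include <iostream>

int main(void)
{
  thrust::ranlux24_base rng;

  thrust::ranlux24_base::result_type x = 0;
  for (int i = 0; i < 10000; ++i)
    x = rng();

  assert(x == 7937952u);
  std::cout << x << std::endl;
  return 0;
}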
+ * \param lhs The first \p subtract_with_carry_engine to test. + * \param rhs The second \p subtract_with_carry_engine to test. + * \return \c true if \p lhs is equal to \p rhs; \c false, otherwise. + */ +template +__host__ __device__ +bool operator==(const subtract_with_carry_engine &lhs, + const subtract_with_carry_engine &rhs); + + +/*! This function checks two \p subtract_with_carry_engines for inequality. + * \param lhs The first \p subtract_with_carry_engine to test. + * \param rhs The second \p subtract_with_carry_engine to test. + * \return \c true if \p lhs is not equal to \p rhs; \c false, otherwise. + */ +template +__host__ __device__ +bool operator!=(const subtract_with_carry_engine&lhs, + const subtract_with_carry_engine&rhs); + + +/*! This function streams a subtract_with_carry_engine to a \p std::basic_ostream. + * \param os The \p basic_ostream to stream out to. + * \param e The \p subtract_with_carry_engine to stream out. + * \return \p os + */ +template +std::basic_ostream& +operator<<(std::basic_ostream &os, + const subtract_with_carry_engine &e); + + +/*! This function streams a subtract_with_carry_engine in from a std::basic_istream. + * \param is The \p basic_istream to stream from. + * \param e The \p subtract_with_carry_engine to stream in. + * \return \p is + */ +template +std::basic_istream& +operator>>(std::basic_istream &is, + subtract_with_carry_engine &e); + + +/*! \} // end random_number_engine_templates + */ + + +/*! \addtogroup predefined_random + * \{ + */ + +// XXX N2111 uses uint_fast32_t here + +/*! \typedef ranlux24_base + * \brief A random number engine with predefined parameters which implements the + * base engine of the \p ranlux24 random number engine. + * \note The 10000th consecutive invocation of a default-constructed object of type \p ranlux24_base + * shall produce the value \c 7937952 . + */ +typedef subtract_with_carry_engine ranlux24_base; + + +// XXX N2111 uses uint_fast64_t here + +/*! \typedef ranlux48_base + * \brief A random number engine with predefined parameters which implements the + * base engine of the \p ranlux48 random number engine. + * \note The 10000th consecutive invocation of a default-constructed object of type \p ranlux48_base + * shall produce the value \c 192113843633948 . + */ +typedef subtract_with_carry_engine ranlux48_base; + +/*! \} // end predefined_random + */ + +} // end random + +// import names into thrust:: +using random::subtract_with_carry_engine; +using random::ranlux24_base; +using random::ranlux48_base; + +} // end thrust + +#include + diff --git a/compat/thrust/random/uniform_int_distribution.h b/compat/thrust/random/uniform_int_distribution.h new file mode 100644 index 0000000..d05f7fa --- /dev/null +++ b/compat/thrust/random/uniform_int_distribution.h @@ -0,0 +1,276 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! 
\file uniform_int_distribution.h + * \brief A uniform distribution of integer-valued numbers + */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace thrust +{ + +namespace random +{ + +/*! \addtogroup random_number_distributions Random Number Distributions Class Templates + * \ingroup random + * \{ + */ + +/*! \class uniform_int_distribution + * \brief A \p uniform_int_distribution random number distribution produces signed or unsigned integer + * uniform random numbers from a given range. + * + * \tparam IntType The type of integer to produce. + * + * The following code snippet demonstrates examples of using a \p uniform_int_distribution with a + * random number engine to produce random integers drawn from a given range: + * + * \code + * #include + * #include + * + * int main(void) + * { + * // create a minstd_rand object to act as our source of randomness + * thrust::minstd_rand rng; + * + * // create a uniform_int_distribution to produce ints from [-7,13] + * thrust::uniform_int_distribution dist(-7,13); + * + * // write a random number from the range [-7,13] to standard output + * std::cout << dist(rng) << std::endl; + * + * // write the range of the distribution, just in case we forgot + * std::cout << dist.min() << std::endl; + * + * // -7 is printed + * + * std::cout << dist.max() << std::endl; + * + * // 13 is printed + * + * // write the parameters of the distribution (which happen to be the bounds) to standard output + * std::cout << dist.a() << std::endl; + * + * // -7 is printed + * + * std::cout << dist.b() << std::endl; + * + * // 13 is printed + * + * return 0; + * } + * \endcode + */ +template + class uniform_int_distribution +{ + public: + // types + + /*! \typedef result_type + * \brief The type of the integer produced by this \p uniform_int_distribution. + */ + typedef IntType result_type; + + /*! \typedef param_type + * \brief The type of the object encapsulating this \p uniform_int_distribution's parameters. + */ + typedef thrust::pair param_type; + + // constructors and reset functions + + /*! This constructor creates a new \p uniform_int_distribution from two values defining the + * range of the distribution. + * + * \param a The smallest integer to potentially produce. Defaults to \c 0. + * \param b The largest integer to potentially produce. Defaults to the largest representable integer in + * the platform. + */ + __host__ __device__ + explicit uniform_int_distribution(IntType a = 0, IntType b = thrust::detail::integer_traits::const_max); + + /*! This constructor creates a new \p uniform_int_distribution from a \p param_type object + * encapsulating the range of the distribution. + * + * \param parm A \p param_type object encapsulating the parameters (i.e., the range) of the distribution. + */ + __host__ __device__ + explicit uniform_int_distribution(const param_type &parm); + + /*! This does nothing. It is included to conform to the requirements of the RandomDistribution concept. + */ + __host__ __device__ + void reset(void); + + // generating functions + + /*! This method produces a new uniform random integer drawn from this \p uniform_int_distribution's + * range using a \p UniformRandomNumberGenerator as a source of randomness. + * + * \param urng The \p UniformRandomNumberGenerator to use as a source of randomness. + */ + template + __host__ __device__ + result_type operator()(UniformRandomNumberGenerator &urng); + + /*! 
This method produces a new uniform random integer as if by creating a new \p uniform_int_distribution + * from the given \p param_type object, and calling its operator() method with the given + * \p UniformRandomNumberGenerator as a source of randomness. + * + * \param urng The \p UniformRandomNumberGenerator to use as a source of randomness. + * \param parm A \p param_type object encapsulating the parameters of the \p uniform_int_distribution + * to draw from. + */ + template + __host__ __device__ + result_type operator()(UniformRandomNumberGenerator &urng, const param_type &parm); + + // property functions + + /*! This method returns the value of the parameter with which this \p uniform_int_distribution + * was constructed. + * + * \return The lower bound of this \p uniform_int_distribution's range. + */ + __host__ __device__ + result_type a(void) const; + + /*! This method returns the value of the parameter with which this \p uniform_int_distribution + * was constructed. + * + * \return The upper bound of this \p uniform_int_distribution's range. + */ + __host__ __device__ + result_type b(void) const; + + /*! This method returns a \p param_type object encapsulating the parameters with which this + * \p uniform_int_distribution was constructed. + * + * \return A \p param_type object enapsulating the range of this \p uniform_int_distribution. + */ + __host__ __device__ + param_type param(void) const; + + /*! This method changes the parameters of this \p uniform_int_distribution using the values encapsulated + * in a given \p param_type object. + * + * \param parm A \p param_type object encapsulating the new range of this \p uniform_int_distribution. + */ + __host__ __device__ + void param(const param_type &parm); + + /*! This method returns the smallest integer this \p uniform_int_distribution can potentially produce. + * + * \return The lower bound of this \p uniform_int_distribution's range. + */ + __host__ __device__ + result_type min THRUST_PREVENT_MACRO_SUBSTITUTION (void) const; + + /*! This method returns the largest integer this \p uniform_int_distribution can potentially produce. + * + * \return The upper bound of this \p uniform_int_distribution's range. + */ + __host__ __device__ + result_type max THRUST_PREVENT_MACRO_SUBSTITUTION (void) const; + + /*! \cond + */ + private: + param_type m_param; + + friend struct thrust::random::detail::random_core_access; + + __host__ __device__ + bool equal(const uniform_int_distribution &rhs) const; + + template + std::basic_ostream& stream_out(std::basic_ostream &os) const; + + template + std::basic_istream& stream_in(std::basic_istream &is); + /*! \endcond + */ +}; // end uniform_int_distribution + + +/*! This function checks two \p uniform_int_distributions for equality. + * \param lhs The first \p uniform_int_distribution to test. + * \param rhs The second \p uniform_int_distribution to test. + * \return \c true if \p lhs is equal to \p rhs; \c false, otherwise. + */ +template +__host__ __device__ +bool operator==(const uniform_int_distribution &lhs, + const uniform_int_distribution &rhs); + + +/*! This function checks two \p uniform_int_distributions for inequality. + * \param lhs The first \p uniform_int_distribution to test. + * \param rhs The second \p uniform_int_distribution to test. + * \return \c true if \p lhs is not equal to \p rhs; \c false, otherwise. + */ +template +__host__ __device__ +bool operator!=(const uniform_int_distribution &lhs, + const uniform_int_distribution &rhs); + + +/*! 
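// The example earlier in this header appears to have lost its <int> template
// argument during extraction. A minimal restored sketch drawing integers
// uniformly from the closed range [-7,13]:
#include <thrust/random/linear_congruential_engine.h>
#include <thrust/random/uniform_int_distribution.h>
#include <iostream>

int main(void)
{
  thrust::minstd_rand rng;
  thrust::uniform_int_distribution<int> dist(-7, 13);

  std::cout << dist(rng) << std::endl;  // a value in [-7,13]
  std::cout << dist.a() << std::endl;   // -7
  std::cout << dist.b() << std::endl;   // 13
  return 0;
}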
This function streams a uniform_int_distribution to a \p std::basic_ostream. + * \param os The \p basic_ostream to stream out to. + * \param d The \p uniform_int_distribution to stream out. + * \return \p os + */ +template +std::basic_ostream& +operator<<(std::basic_ostream &os, + const uniform_int_distribution &d); + + +/*! This function streams a uniform_int_distribution in from a std::basic_istream. + * \param is The \p basic_istream to stream from. + * \param d The \p uniform_int_distribution to stream in. + * \return \p is + */ +template +std::basic_istream& +operator>>(std::basic_istream &is, + uniform_int_distribution &d); + + +/*! \} // end random_number_distributions + */ + + +} // end random + +using random::uniform_int_distribution; + +} // end thrust + +#include + diff --git a/compat/thrust/random/uniform_real_distribution.h b/compat/thrust/random/uniform_real_distribution.h new file mode 100644 index 0000000..ab85ab3 --- /dev/null +++ b/compat/thrust/random/uniform_real_distribution.h @@ -0,0 +1,274 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file uniform_real_distribution.h + * \brief A uniform distribution of real-valued numbers + */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ + +namespace random +{ + + +/*! \addtogroup random_number_distributions + * \{ + */ + +/*! \class uniform_real_distribution + * \brief A \p uniform_real_distribution random number distribution produces floating point + * uniform random numbers from a half-open interval. + * + * \tparam RealType The type of floating point number to produce. + * + * The following code snippet demonstrates examples of using a \p uniform_real_distribution with a + * random number engine to produce random integers drawn from a given range: + * + * \code + * #include + * #include + * + * int main(void) + * { + * // create a minstd_rand object to act as our source of randomness + * thrust::minstd_rand rng; + * + * // create a uniform_real_distribution to produce floats from [-7,13) + * thrust::uniform_real_distribution dist(-7,13); + * + * // write a random number from the range [-7,13) to standard output + * std::cout << dist(rng) << std::endl; + * + * // write the range of the distribution, just in case we forgot + * std::cout << dist.min() << std::endl; + * + * // -7.0 is printed + * + * std::cout << dist.max() << std::endl; + * + * // 13.0 is printed + * + * // write the parameters of the distribution (which happen to be the bounds) to standard output + * std::cout << dist.a() << std::endl; + * + * // -7.0 is printed + * + * std::cout << dist.b() << std::endl; + * + * // 13.0 is printed + * + * return 0; + * } + * \endcode + */ +template + class uniform_real_distribution +{ + public: + // types + + /*! \typedef result_type + * \brief The type of the floating point number produced by this \p uniform_real_distribution. + */ + typedef RealType result_type; + + /*! 
\typedef param_type + * \brief The type of the object encapsulating this \p uniform_real_distribution's parameters. + */ + typedef thrust::pair param_type; + + // constructors and reset functions + + /*! This constructor creates a new \p uniform_real_distribution from two values defining the + * half-open interval of the distribution. + * + * \param a The smallest floating point number to potentially produce. Defaults to \c 0.0. + * \param b The smallest number larger than the largest floating point number to potentially produce. Defaults to \c 1.0. + */ + __host__ __device__ + explicit uniform_real_distribution(RealType a = 0.0, RealType b = 1.0); + + /*! This constructor creates a new \p uniform_real_distribution from a \p param_type object + * encapsulating the range of the distribution. + * + * \param parm A \p param_type object encapsulating the parameters (i.e., the range) of the distribution. + */ + __host__ __device__ + explicit uniform_real_distribution(const param_type &parm); + + /*! This does nothing. It is included to conform to the requirements of the RandomDistribution concept. + */ + __host__ __device__ + void reset(void); + + // generating functions + + /*! This method produces a new uniform random integer drawn from this \p uniform_real_distribution's + * range using a \p UniformRandomNumberGenerator as a source of randomness. + * + * \param urng The \p UniformRandomNumberGenerator to use as a source of randomness. + */ + template + __host__ __device__ + result_type operator()(UniformRandomNumberGenerator &urng); + + /*! This method produces a new uniform random integer as if by creating a new \p uniform_real_distribution + * from the given \p param_type object, and calling its operator() method with the given + * \p UniformRandomNumberGenerator as a source of randomness. + * + * \param urng The \p UniformRandomNumberGenerator to use as a source of randomness. + * \param parm A \p param_type object encapsulating the parameters of the \p uniform_real_distribution + * to draw from. + */ + template + __host__ __device__ + result_type operator()(UniformRandomNumberGenerator &urng, const param_type &parm); + + // property functions + + /*! This method returns the value of the parameter with which this \p uniform_real_distribution + * was constructed. + * + * \return The lower bound of this \p uniform_real_distribution's half-open interval. + */ + __host__ __device__ + result_type a(void) const; + + /*! This method returns the value of the parameter with which this \p uniform_real_distribution + * was constructed. + * + * \return The upper bound of this \p uniform_real_distribution's half-open interval. + */ + __host__ __device__ + result_type b(void) const; + + /*! This method returns a \p param_type object encapsulating the parameters with which this + * \p uniform_real_distribution was constructed. + * + * \return A \p param_type object enapsulating the half-open interval of this \p uniform_real_distribution. + */ + __host__ __device__ + param_type param(void) const; + + /*! This method changes the parameters of this \p uniform_real_distribution using the values encapsulated + * in a given \p param_type object. + * + * \param parm A \p param_type object encapsulating the new half-open interval of this \p uniform_real_distribution. + */ + __host__ __device__ + void param(const param_type &parm); + + /*! This method returns the smallest floating point number this \p uniform_real_distribution can potentially produce. 
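// The example earlier in this header appears to have lost its <float>
// template argument during extraction. A minimal restored sketch drawing
// floats uniformly from the half-open interval [-7,13):
#include <thrust/random/linear_congruential_engine.h>
#include <thrust/random/uniform_real_distribution.h>
#include <iostream>

int main(void)
{
  thrust::minstd_rand rng;
  thrust::uniform_real_distribution<float> dist(-7.0f, 13.0f);

  std::cout << dist(rng) << std::endl;  // a value in [-7,13)
  std::cout << dist.a() << std::endl;   // -7
  std::cout << dist.b() << std::endl;   // 13
  return 0;
}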
+ * + * \return The lower bound of this \p uniform_real_distribution's half-open interval. + */ + __host__ __device__ + result_type min THRUST_PREVENT_MACRO_SUBSTITUTION (void) const; + + /*! This method returns the smallest number larger than largest floating point number this \p uniform_real_distribution can potentially produce. + * + * \return The upper bound of this \p uniform_real_distribution's half-open interval. + */ + __host__ __device__ + result_type max THRUST_PREVENT_MACRO_SUBSTITUTION (void) const; + + /*! \cond + */ + private: + param_type m_param; + + friend struct thrust::random::detail::random_core_access; + + __host__ __device__ + bool equal(const uniform_real_distribution &rhs) const; + + template + std::basic_ostream& stream_out(std::basic_ostream &os) const; + + template + std::basic_istream& stream_in(std::basic_istream &is); + /*! \endcond + */ +}; // end uniform_real_distribution + + +/*! This function checks two \p uniform_real_distributions for equality. + * \param lhs The first \p uniform_real_distribution to test. + * \param rhs The second \p uniform_real_distribution to test. + * \return \c true if \p lhs is equal to \p rhs; \c false, otherwise. + */ +template +__host__ __device__ +bool operator==(const uniform_real_distribution &lhs, + const uniform_real_distribution &rhs); + + +/*! This function checks two \p uniform_real_distributions for inequality. + * \param lhs The first \p uniform_real_distribution to test. + * \param rhs The second \p uniform_real_distribution to test. + * \return \c true if \p lhs is not equal to \p rhs; \c false, otherwise. + */ +template +__host__ __device__ +bool operator!=(const uniform_real_distribution &lhs, + const uniform_real_distribution &rhs); + + +/*! This function streams a uniform_real_distribution to a \p std::basic_ostream. + * \param os The \p basic_ostream to stream out to. + * \param d The \p uniform_real_distribution to stream out. + * \return \p os + */ +template +std::basic_ostream& +operator<<(std::basic_ostream &os, + const uniform_real_distribution &d); + + +/*! This function streams a uniform_real_distribution in from a std::basic_istream. + * \param is The \p basic_istream to stream from. + * \param d The \p uniform_real_distribution to stream in. + * \return \p is + */ +template +std::basic_istream& +operator>>(std::basic_istream &is, + uniform_real_distribution &d); + + +/*! \} // end random_number_distributions + */ + + +} // end random + +using random::uniform_real_distribution; + +} // end thrust + +#include + diff --git a/compat/thrust/random/xor_combine_engine.h b/compat/thrust/random/xor_combine_engine.h new file mode 100644 index 0000000..61eb5a5 --- /dev/null +++ b/compat/thrust/random/xor_combine_engine.h @@ -0,0 +1,271 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! 
\file xor_combine_engine.h + * \brief A pseudorandom number generator which produces pseudorandom + * numbers from two integer base engines by merging their + * pseudorandom numbers with bitwise exclusive-or. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include // for size_t + +namespace thrust +{ + +namespace random +{ + +/*! \addtogroup random_number_engine_adaptors + * \{ + */ + +/*! \class xor_combine_engine + * \brief An \p xor_combine_engine adapts two existing base random number engines and + * produces random values by combining the values produced by each. + * + * \tparam Engine1 The type of the first base random number engine to adapt. + * \tparam s1 The size of the first shift to use in the generation algorithm. + * \tparam Engine2 The type of the second base random number engine to adapt. + * \tparam s2 The second of the second shift to use in the generation algorithm. Defaults to \c 0. + * + * The following code snippet shows an example of using an \p xor_combine_engine instance: + * + * \code + * #include + * #include + * #include + * + * int main(void) + * { + * // create an xor_combine_engine from minstd_rand and minstd_rand0 + * // use a shift of 0 for each + * thrust::xor_combine_engine rng; + * + * // print a random number to standard output + * std::cout << rng() << std::endl; + * + * return 0; + * } + * \endcode + */ +template + class xor_combine_engine +{ + public: + // types + + /*! \typedef base1_type + * \brief The type of the first adapted base random number engine. + */ + typedef Engine1 base1_type; + + /*! \typedef base2_type + * \brief The type of the second adapted base random number engine. + */ + typedef Engine2 base2_type; + + /*! \typedef result_type + * \brief The type of the unsigned integer produced by this \p xor_combine_engine. + */ + typedef typename thrust::detail::eval_if< + (sizeof(typename base2_type::result_type) > sizeof(typename base1_type::result_type)), + thrust::detail::identity_, + thrust::detail::identity_ + >::type result_type; + + /*! The size of the first shift used in the generation algorithm. + */ + static const size_t shift1 = s1; + + /*! The size of the second shift used in the generation algorithm. + */ + static const size_t shift2 = s2; + + /*! The smallest value this \p xor_combine_engine may potentially produce. + */ + static const result_type min = 0; + + /*! The largest value this \p xor_combine_engine may potentially produce. + */ + static const result_type max = + detail::xor_combine_engine_max< + Engine1, s1, Engine2, s2, result_type + >::value; + + // constructors and seeding functions + + /*! This constructor constructs a new \p xor_combine_engine and constructs + * its adapted engines using their null constructors. + */ + __host__ __device__ + xor_combine_engine(void); + + /*! This constructor constructs a new \p xor_combine_engine using + * given \p base1_type and \p base2_type engines to initialize its adapted base engines. + * + * \param urng1 A \p base1_type to use to initialize this \p xor_combine_engine's + * first adapted base engine. + * \param urng2 A \p base2_type to use to initialize this \p xor_combine_engine's + * first adapted base engine. + */ + __host__ __device__ + xor_combine_engine(const base1_type &urng1, const base2_type &urng2); + + /*! This constructor initializes a new \p xor_combine_engine with a given seed. + * + * \param s The seed used to intialize this \p xor_combine_engine's adapted base engines. 
+ */ + __host__ __device__ + xor_combine_engine(result_type s); + + /*! This method initializes the state of this \p xor_combine_engine's adapted base engines + * by using their \p default_seed values. + */ + __host__ __device__ + void seed(void); + + /*! This method initializes the state of this \p xor_combine_engine's adapted base engines + * by using the given seed. + * + * \param s The seed with which to intialize this \p xor_combine_engine's adapted base engines. + */ + __host__ __device__ + void seed(result_type s); + + // generating functions + + /*! This member function produces a new random value and updates this \p xor_combine_engine's state. + * \return A new random number. + */ + __host__ __device__ + result_type operator()(void); + + /*! This member function advances this \p xor_combine_engine's state a given number of times + * and discards the results. + * + * \param z The number of random values to discard. + * \note This function is provided because an implementation may be able to accelerate it. + */ + __host__ __device__ + void discard(unsigned long long z); + + // property functions + + /*! This member function returns a const reference to this \p xor_combine_engine's + * first adapted base engine. + * + * \return A const reference to the first base engine this \p xor_combine_engine adapts. + */ + __host__ __device__ + const base1_type &base1(void) const; + + /*! This member function returns a const reference to this \p xor_combine_engine's + * second adapted base engine. + * + * \return A const reference to the second base engine this \p xor_combine_engine adapts. + */ + __host__ __device__ + const base2_type &base2(void) const; + + /*! \cond + */ + private: + base1_type m_b1; + base2_type m_b2; + + friend struct thrust::random::detail::random_core_access; + + __host__ __device__ + bool equal(const xor_combine_engine &rhs) const; + + template + std::basic_istream& stream_in(std::basic_istream &is); + + template + std::basic_ostream& stream_out(std::basic_ostream &os) const; + + /*! \endcond + */ +}; // end xor_combine_engine + + +/*! This function checks two \p xor_combine_engines for equality. + * \param lhs The first \p xor_combine_engine to test. + * \param rhs The second \p xor_combine_engine to test. + * \return \c true if \p lhs is equal to \p rhs; \c false, otherwise. + */ +template +__host__ __device__ +bool operator==(const xor_combine_engine &lhs, + const xor_combine_engine &rhs); + + +/*! This function checks two \p xor_combine_engines for inequality. + * \param lhs The first \p xor_combine_engine to test. + * \param rhs The second \p xor_combine_engine to test. + * \return \c true if \p lhs is not equal to \p rhs; \c false, otherwise. + */ +template +__host__ __device__ +bool operator!=(const xor_combine_engine &lhs, + const xor_combine_engine &rhs); + + +/*! This function streams a xor_combine_engine to a \p std::basic_ostream. + * \param os The \p basic_ostream to stream out to. + * \param e The \p xor_combine_engine to stream out. + * \return \p os + */ +template +std::basic_ostream& +operator<<(std::basic_ostream &os, + const xor_combine_engine &e); + + +/*! This function streams a xor_combine_engine in from a std::basic_istream. + * \param is The \p basic_istream to stream from. + * \param e The \p xor_combine_engine to stream in. + * \return \p is + */ +template +std::basic_istream& +operator>>(std::basic_istream &is, + xor_combine_engine &e); + + +/*! 
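// The example earlier in this header appears to have lost its template
// arguments during extraction. A minimal restored sketch, assuming the setup
// described there (minstd_rand and minstd_rand0, shift of 0 for each):
#include <thrust/random/linear_congruential_engine.h>
#include <thrust/random/xor_combine_engine.h>
#include <iostream>

int main(void)
{
  thrust::xor_combine_engine<thrust::minstd_rand,  0,
                             thrust::minstd_rand0, 0> rng;

  std::cout << rng() << std::endl;  // one combined random value
  return 0;
}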
\} // end random_number_engine_adaptors + */ + + +} // end random + +// import names into thrust:: +using random::xor_combine_engine; + +} // end thrust + +#include + diff --git a/compat/thrust/reduce.h b/compat/thrust/reduce.h new file mode 100644 index 0000000..1dc931f --- /dev/null +++ b/compat/thrust/reduce.h @@ -0,0 +1,779 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file reduce.h + * \brief Functions for reducing a range to a single value + */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ + + +/*! \addtogroup reductions + * \{ + */ + + +/*! \p reduce is a generalization of summation: it computes the sum (or some + * other binary operation) of all the elements in the range [first, + * last). This version of \p reduce uses \c 0 as the initial value of the + * reduction. \p reduce is similar to the C++ Standard Template Library's + * std::accumulate. The primary difference between the two functions + * is that std::accumulate guarantees the order of summation, while + * \p reduce requires associativity of the binary operation to parallelize + * the reduction. + * + * Note that \p reduce also assumes that the binary reduction operator (in this + * case operator+) is commutative. If the reduction operator is not commutative + * then \p thrust::reduce should not be used. Instead, one could use + * \p inclusive_scan (which does not require commutativity) and select the + * last element of the output array. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the sequence. + * \param last The end of the sequence. + * \return The result of the reduction. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator is a model of Input Iterator + * and if \c x and \c y are objects of \p InputIterator's \c value_type, + * then x + y is defined and is convertible to \p InputIterator's + * \c value_type. If \c T is \c InputIterator's \c value_type, then + * T(0) is defined. + * + * The following code snippet demonstrates how to use \p reduce to compute + * the sum of a sequence of integers using the \p thrust::host execution policy for parallelization: + * + * \code + * #include + * #include + * ... + * int data[6] = {1, 0, 2, 2, 1, 3}; + * int result = thrust::reduce(thrust::host, data, data + 6); + * + * // result == 9 + * \endcode + * + * \see http://www.sgi.com/tech/stl/accumulate.html + */ +template + typename thrust::iterator_traits::value_type + reduce(const thrust::detail::execution_policy_base &exec, InputIterator first, InputIterator last); + + +/*! \p reduce is a generalization of summation: it computes the sum (or some + * other binary operation) of all the elements in the range [first, + * last). This version of \p reduce uses \c 0 as the initial value of the + * reduction. 
\p reduce is similar to the C++ Standard Template Library's + * std::accumulate. The primary difference between the two functions + * is that std::accumulate guarantees the order of summation, while + * \p reduce requires associativity of the binary operation to parallelize + * the reduction. + * + * Note that \p reduce also assumes that the binary reduction operator (in this + * case operator+) is commutative. If the reduction operator is not commutative + * then \p thrust::reduce should not be used. Instead, one could use + * \p inclusive_scan (which does not require commutativity) and select the + * last element of the output array. + * + * \param first The beginning of the sequence. + * \param last The end of the sequence. + * \return The result of the reduction. + * + * \tparam InputIterator is a model of Input Iterator + * and if \c x and \c y are objects of \p InputIterator's \c value_type, + * then x + y is defined and is convertible to \p InputIterator's + * \c value_type. If \c T is \c InputIterator's \c value_type, then + * T(0) is defined. + * + * The following code snippet demonstrates how to use \p reduce to compute + * the sum of a sequence of integers. + * + * \code + * #include + * ... + * int data[6] = {1, 0, 2, 2, 1, 3}; + * int result = thrust::reduce(data, data + 6); + * + * // result == 9 + * \endcode + * + * \see http://www.sgi.com/tech/stl/accumulate.html + */ +template typename + thrust::iterator_traits::value_type reduce(InputIterator first, InputIterator last); + + +/*! \p reduce is a generalization of summation: it computes the sum (or some + * other binary operation) of all the elements in the range [first, + * last). This version of \p reduce uses \p init as the initial value of the + * reduction. \p reduce is similar to the C++ Standard Template Library's + * std::accumulate. The primary difference between the two functions + * is that std::accumulate guarantees the order of summation, while + * \p reduce requires associativity of the binary operation to parallelize + * the reduction. + * + * Note that \p reduce also assumes that the binary reduction operator (in this + * case operator+) is commutative. If the reduction operator is not commutative + * then \p thrust::reduce should not be used. Instead, one could use + * \p inclusive_scan (which does not require commutativity) and select the + * last element of the output array. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the input sequence. + * \param last The end of the input sequence. + * \param init The initial value. + * \return The result of the reduction. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator is a model of Input Iterator + * and if \c x and \c y are objects of \p InputIterator's \c value_type, + * then x + y is defined and is convertible to \p T. + * \tparam T is convertible to \p InputIterator's \c value_type. + * + * The following code snippet demonstrates how to use \p reduce to compute + * the sum of a sequence of integers including an intialization value using the \p thrust::host + * execution policy for parallelization: + * + * \code + * #include + * #include + * ... 
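+ * // sum six integers on the host, seeding the reduction with the initial value 1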
+ * int data[6] = {1, 0, 2, 2, 1, 3}; + * int result = thrust::reduce(thrust::host, data, data + 6, 1); + * + * // result == 10 + * \endcode + * + * \see http://www.sgi.com/tech/stl/accumulate.html + */ +template + T reduce(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + T init); + + +/*! \p reduce is a generalization of summation: it computes the sum (or some + * other binary operation) of all the elements in the range [first, + * last). This version of \p reduce uses \p init as the initial value of the + * reduction. \p reduce is similar to the C++ Standard Template Library's + * std::accumulate. The primary difference between the two functions + * is that std::accumulate guarantees the order of summation, while + * \p reduce requires associativity of the binary operation to parallelize + * the reduction. + * + * Note that \p reduce also assumes that the binary reduction operator (in this + * case operator+) is commutative. If the reduction operator is not commutative + * then \p thrust::reduce should not be used. Instead, one could use + * \p inclusive_scan (which does not require commutativity) and select the + * last element of the output array. + * + * \param first The beginning of the input sequence. + * \param last The end of the input sequence. + * \param init The initial value. + * \return The result of the reduction. + * + * \tparam InputIterator is a model of Input Iterator + * and if \c x and \c y are objects of \p InputIterator's \c value_type, + * then x + y is defined and is convertible to \p T. + * \tparam T is convertible to \p InputIterator's \c value_type. + * + * The following code snippet demonstrates how to use \p reduce to compute + * the sum of a sequence of integers including an intialization value. + * + * \code + * #include + * ... + * int data[6] = {1, 0, 2, 2, 1, 3}; + * int result = thrust::reduce(data, data + 6, 1); + * + * // result == 10 + * \endcode + * + * \see http://www.sgi.com/tech/stl/accumulate.html + */ +template + T reduce(InputIterator first, + InputIterator last, + T init); + + +/*! \p reduce is a generalization of summation: it computes the sum (or some + * other binary operation) of all the elements in the range [first, + * last). This version of \p reduce uses \p init as the initial value of the + * reduction and \p binary_op as the binary function used for summation. \p reduce + * is similar to the C++ Standard Template Library's std::accumulate. + * The primary difference between the two functions is that std::accumulate + * guarantees the order of summation, while \p reduce requires associativity of + * \p binary_op to parallelize the reduction. + * + * Note that \p reduce also assumes that the binary reduction operator (in this + * case \p binary_op) is commutative. If the reduction operator is not commutative + * then \p thrust::reduce should not be used. Instead, one could use + * \p inclusive_scan (which does not require commutativity) and select the + * last element of the output array. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the input sequence. + * \param last The end of the input sequence. + * \param init The initial value. + * \param binary_op The binary function used to 'sum' values. + * \return The result of the reduction. + * + * \tparam DerivedPolicy The name of the derived execution policy. 
+ * \tparam InputIterator is a model of Input Iterator + * and \c InputIterator's \c value_type is convertible to \c T. + * \tparam T is a model of Assignable, + * and is convertible to \p BinaryFunction's \c first_argument_type and \c second_argument_type. + * \tparam BinaryFunction is a model of Binary Function, + * and \p BinaryFunction's \c result_type is convertible to \p OutputType. + * + * The following code snippet demonstrates how to use \p reduce to + * compute the maximum value of a sequence of integers using the \p thrust::host execution policy + * for parallelization: + * + * \code + * #include + * #include + * #include + * ... + * int data[6] = {1, 0, 2, 2, 1, 3}; + * int result = thrust::reduce(thrust::host, + * data, data + 6, + * -1, + * thrust::maximum()); + * // result == 3 + * \endcode + * + * \see http://www.sgi.com/tech/stl/accumulate.html + * \see transform_reduce + */ +template + T reduce(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + T init, + BinaryFunction binary_op); + + +/*! \p reduce is a generalization of summation: it computes the sum (or some + * other binary operation) of all the elements in the range [first, + * last). This version of \p reduce uses \p init as the initial value of the + * reduction and \p binary_op as the binary function used for summation. \p reduce + * is similar to the C++ Standard Template Library's std::accumulate. + * The primary difference between the two functions is that std::accumulate + * guarantees the order of summation, while \p reduce requires associativity of + * \p binary_op to parallelize the reduction. + * + * Note that \p reduce also assumes that the binary reduction operator (in this + * case \p binary_op) is commutative. If the reduction operator is not commutative + * then \p thrust::reduce should not be used. Instead, one could use + * \p inclusive_scan (which does not require commutativity) and select the + * last element of the output array. + * + * \param first The beginning of the input sequence. + * \param last The end of the input sequence. + * \param init The initial value. + * \param binary_op The binary function used to 'sum' values. + * \return The result of the reduction. + * + * \tparam InputIterator is a model of Input Iterator + * and \c InputIterator's \c value_type is convertible to \c T. + * \tparam T is a model of Assignable, + * and is convertible to \p BinaryFunction's \c first_argument_type and \c second_argument_type. + * \tparam BinaryFunction is a model of Binary Function, + * and \p BinaryFunction's \c result_type is convertible to \p OutputType. + * + * The following code snippet demonstrates how to use \p reduce to + * compute the maximum value of a sequence of integers. + * + * \code + * #include + * #include + * ... + * int data[6] = {1, 0, 2, 2, 1, 3}; + * int result = thrust::reduce(data, data + 6, + * -1, + * thrust::maximum()); + * // result == 3 + * \endcode + * + * \see http://www.sgi.com/tech/stl/accumulate.html + * \see transform_reduce + */ +template + T reduce(InputIterator first, + InputIterator last, + T init, + BinaryFunction binary_op); + + +/*! \p reduce_by_key is a generalization of \p reduce to key-value pairs. + * For each group of consecutive keys in the range [keys_first, keys_last) + * that are equal, \p reduce_by_key copies the first element of the group to the + * \c keys_output. The corresponding values in the range are reduced using the + * \c plus and the result copied to \c values_output. 
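+ *
+ * As a minimal sketch of the grouping semantics (hypothetical data; assumes
+ * <thrust/reduce.h> and <thrust/execution_policy.h> are included):
+ *
+ * \code
+ * int keys[3] = {1, 1, 2};
+ * int vals[3] = {3, 4, 5};
+ * int keys_out[3];
+ * int vals_out[3];
+ * thrust::reduce_by_key(thrust::host, keys, keys + 3, vals, keys_out, vals_out);
+ * // keys_out begins {1, 2}; vals_out begins {7, 5}
+ * \endcode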
+ * + * This version of \p reduce_by_key uses the function object \c equal_to + * to test for equality and \c plus to reduce values with equal keys. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param keys_first The beginning of the input key range. + * \param keys_last The end of the input key range. + * \param values_first The beginning of the input value range. + * \param keys_output The beginning of the output key range. + * \param values_output The beginning of the output value range. + * \return A pair of iterators at end of the ranges [keys_output, keys_output_last) and [values_output, values_output_last). + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator1 is a model of Input Iterator, + * \tparam InputIterator2 is a model of Input Iterator, + * \tparam OutputIterator1 is a model of Output Iterator and + * and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type. + * \tparam OutputIterator2 is a model of Output Iterator and + * and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type. + * + * \pre The input ranges shall not overlap either output range. + * + * The following code snippet demonstrates how to use \p reduce_by_key to + * compact a sequence of key/value pairs and sum values with equal keys using the \p thrust::host + * execution policy for parallelization: + * + * \code + * #include + * #include + * ... + * const int N = 7; + * int A[N] = {1, 3, 3, 3, 2, 2, 1}; // input keys + * int B[N] = {9, 8, 7, 6, 5, 4, 3}; // input values + * int C[N]; // output keys + * int D[N]; // output values + * + * thrust::pair new_end; + * new_end = thrust::reduce_by_key(thrust::host, A, A + N, B, C, D); + * + * // The first four keys in C are now {1, 3, 2, 1} and new_end.first - C is 4. + * // The first four values in D are now {9, 21, 9, 3} and new_end.second - D is 4. + * \endcode + * + * \see reduce + * \see unique_copy + * \see unique_by_key + * \see unique_by_key_copy + */ +template + thrust::pair + reduce_by_key(const thrust::detail::execution_policy_base &exec, + InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + OutputIterator1 keys_output, + OutputIterator2 values_output); + + +/*! \p reduce_by_key is a generalization of \p reduce to key-value pairs. + * For each group of consecutive keys in the range [keys_first, keys_last) + * that are equal, \p reduce_by_key copies the first element of the group to the + * \c keys_output. The corresponding values in the range are reduced using the + * \c plus and the result copied to \c values_output. + * + * This version of \p reduce_by_key uses the function object \c equal_to + * to test for equality and \c plus to reduce values with equal keys. + * + * \param keys_first The beginning of the input key range. + * \param keys_last The end of the input key range. + * \param values_first The beginning of the input value range. + * \param keys_output The beginning of the output key range. + * \param values_output The beginning of the output value range. + * \return A pair of iterators at end of the ranges [keys_output, keys_output_last) and [values_output, values_output_last). 
+ * + * \tparam InputIterator1 is a model of Input Iterator, + * \tparam InputIterator2 is a model of Input Iterator, + * \tparam OutputIterator1 is a model of Output Iterator and + * and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type. + * \tparam OutputIterator2 is a model of Output Iterator and + * and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type. + * + * \pre The input ranges shall not overlap either output range. + * + * The following code snippet demonstrates how to use \p reduce_by_key to + * compact a sequence of key/value pairs and sum values with equal keys. + * + * \code + * #include + * ... + * const int N = 7; + * int A[N] = {1, 3, 3, 3, 2, 2, 1}; // input keys + * int B[N] = {9, 8, 7, 6, 5, 4, 3}; // input values + * int C[N]; // output keys + * int D[N]; // output values + * + * thrust::pair new_end; + * new_end = thrust::reduce_by_key(A, A + N, B, C, D); + * + * // The first four keys in C are now {1, 3, 2, 1} and new_end.first - C is 4. + * // The first four values in D are now {9, 21, 9, 3} and new_end.second - D is 4. + * \endcode + * + * \see reduce + * \see unique_copy + * \see unique_by_key + * \see unique_by_key_copy + */ +template + thrust::pair + reduce_by_key(InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + OutputIterator1 keys_output, + OutputIterator2 values_output); + + +/*! \p reduce_by_key is a generalization of \p reduce to key-value pairs. + * For each group of consecutive keys in the range [keys_first, keys_last) + * that are equal, \p reduce_by_key copies the first element of the group to the + * \c keys_output. The corresponding values in the range are reduced using the + * \c plus and the result copied to \c values_output. + * + * This version of \p reduce_by_key uses the function object \c binary_pred + * to test for equality and \c plus to reduce values with equal keys. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param keys_first The beginning of the input key range. + * \param keys_last The end of the input key range. + * \param values_first The beginning of the input value range. + * \param keys_output The beginning of the output key range. + * \param values_output The beginning of the output value range. + * \param binary_pred The binary predicate used to determine equality. + * \return A pair of iterators at end of the ranges [keys_output, keys_output_last) and [values_output, values_output_last). + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator1 is a model of Input Iterator, + * \tparam InputIterator2 is a model of Input Iterator, + * \tparam OutputIterator1 is a model of Output Iterator and + * and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type. + * \tparam OutputIterator2 is a model of Output Iterator and + * and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type. + * \tparam BinaryPredicate is a model of Binary Predicate. + * + * \pre The input ranges shall not overlap either output range. + * + * The following code snippet demonstrates how to use \p reduce_by_key to + * compact a sequence of key/value pairs and sum values with equal keys using the \p thrust::host + * execution policy for parallelization: + * + * \code + * #include + * #include + * ... 
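+ * // group consecutive equal keys (compared with binary_pred) on the host and sum their values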
+ * const int N = 7; + * int A[N] = {1, 3, 3, 3, 2, 2, 1}; // input keys + * int B[N] = {9, 8, 7, 6, 5, 4, 3}; // input values + * int C[N]; // output keys + * int D[N]; // output values + * + * thrust::pair new_end; + * thrust::equal_to binary_pred; + * new_end = thrust::reduce_by_key(thrust::host, A, A + N, B, C, D, binary_pred); + * + * // The first four keys in C are now {1, 3, 2, 1} and new_end.first - C is 4. + * // The first four values in D are now {9, 21, 9, 3} and new_end.second - D is 4. + * \endcode + * + * \see reduce + * \see unique_copy + * \see unique_by_key + * \see unique_by_key_copy + */ +template + thrust::pair + reduce_by_key(const thrust::detail::execution_policy_base &exec, + InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + OutputIterator1 keys_output, + OutputIterator2 values_output, + BinaryPredicate binary_pred); + + +/*! \p reduce_by_key is a generalization of \p reduce to key-value pairs. + * For each group of consecutive keys in the range [keys_first, keys_last) + * that are equal, \p reduce_by_key copies the first element of the group to the + * \c keys_output. The corresponding values in the range are reduced using the + * \c plus and the result copied to \c values_output. + * + * This version of \p reduce_by_key uses the function object \c binary_pred + * to test for equality and \c plus to reduce values with equal keys. + * + * \param keys_first The beginning of the input key range. + * \param keys_last The end of the input key range. + * \param values_first The beginning of the input value range. + * \param keys_output The beginning of the output key range. + * \param values_output The beginning of the output value range. + * \param binary_pred The binary predicate used to determine equality. + * \return A pair of iterators at end of the ranges [keys_output, keys_output_last) and [values_output, values_output_last). + * + * \tparam InputIterator1 is a model of Input Iterator, + * \tparam InputIterator2 is a model of Input Iterator, + * \tparam OutputIterator1 is a model of Output Iterator and + * and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type. + * \tparam OutputIterator2 is a model of Output Iterator and + * and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type. + * \tparam BinaryPredicate is a model of Binary Predicate. + * + * \pre The input ranges shall not overlap either output range. + * + * The following code snippet demonstrates how to use \p reduce_by_key to + * compact a sequence of key/value pairs and sum values with equal keys. + * + * \code + * #include + * ... + * const int N = 7; + * int A[N] = {1, 3, 3, 3, 2, 2, 1}; // input keys + * int B[N] = {9, 8, 7, 6, 5, 4, 3}; // input values + * int C[N]; // output keys + * int D[N]; // output values + * + * thrust::pair new_end; + * thrust::equal_to binary_pred; + * new_end = thrust::reduce_by_key(A, A + N, B, C, D, binary_pred); + * + * // The first four keys in C are now {1, 3, 2, 1} and new_end.first - C is 4. + * // The first four values in D are now {9, 21, 9, 3} and new_end.second - D is 4. + * \endcode + * + * \see reduce + * \see unique_copy + * \see unique_by_key + * \see unique_by_key_copy + */ +template + thrust::pair + reduce_by_key(InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + OutputIterator1 keys_output, + OutputIterator2 values_output, + BinaryPredicate binary_pred); + + +/*! 
\p reduce_by_key is a generalization of \p reduce to key-value pairs. + * For each group of consecutive keys in the range [keys_first, keys_last) + * that are equal, \p reduce_by_key copies the first element of the group to the + * \c keys_output. The corresponding values in the range are reduced using the + * \c BinaryFunction \c binary_op and the result copied to \c values_output. + * Specifically, if consecutive key iterators \c i and \c (i + 1) are + * such that binary_pred(*i, *(i+1)) is \c true, then the corresponding + * values are reduced to a single value with \c binary_op. + * + * This version of \p reduce_by_key uses the function object \c binary_pred + * to test for equality and \c binary_op to reduce values with equal keys. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param keys_first The beginning of the input key range. + * \param keys_last The end of the input key range. + * \param values_first The beginning of the input value range. + * \param keys_output The beginning of the output key range. + * \param values_output The beginning of the output value range. + * \param binary_pred The binary predicate used to determine equality. + * \param binary_op The binary function used to accumulate values. + * \return A pair of iterators at end of the ranges [keys_output, keys_output_last) and [values_output, values_output_last). + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator1 is a model of Input Iterator, + * \tparam InputIterator2 is a model of Input Iterator, + * \tparam OutputIterator1 is a model of Output Iterator and + * and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type. + * \tparam OutputIterator2 is a model of Output Iterator and + * and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type. + * \tparam BinaryPredicate is a model of Binary Predicate. + * \tparam BinaryFunction is a model of Binary Function + * and \c BinaryFunction's \c result_type is convertible to \c OutputIterator2's \c value_type. + * + * \pre The input ranges shall not overlap either output range. + * + * The following code snippet demonstrates how to use \p reduce_by_key to + * compact a sequence of key/value pairs and sum values with equal keys using the \p thrust::host + * execution policy for parallelization: + * + * \code + * #include + * #include + * ... + * const int N = 7; + * int A[N] = {1, 3, 3, 3, 2, 2, 1}; // input keys + * int B[N] = {9, 8, 7, 6, 5, 4, 3}; // input values + * int C[N]; // output keys + * int D[N]; // output values + * + * thrust::pair new_end; + * thrust::equal_to binary_pred; + * thrust::plus binary_op; + * new_end = thrust::reduce_by_key(thrust::host, A, A + N, B, C, D, binary_pred, binary_op); + * + * // The first four keys in C are now {1, 3, 2, 1} and new_end.first - C is 4. + * // The first four values in D are now {9, 21, 9, 3} and new_end.second - D is 4. + * \endcode + * + * \see reduce + * \see unique_copy + * \see unique_by_key + * \see unique_by_key_copy + */ +template + thrust::pair + reduce_by_key(const thrust::detail::execution_policy_base &exec, + InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + OutputIterator1 keys_output, + OutputIterator2 values_output, + BinaryPredicate binary_pred, + BinaryFunction binary_op); + + +/*! \p reduce_by_key is a generalization of \p reduce to key-value pairs. 
+ * For each group of consecutive keys in the range [keys_first, keys_last) + * that are equal, \p reduce_by_key copies the first element of the group to the + * \c keys_output. The corresponding values in the range are reduced using the + * \c BinaryFunction \c binary_op and the result copied to \c values_output. + * Specifically, if consecutive key iterators \c i and \c (i + 1) are + * such that binary_pred(*i, *(i+1)) is \c true, then the corresponding + * values are reduced to a single value with \c binary_op. + * + * This version of \p reduce_by_key uses the function object \c binary_pred + * to test for equality and \c binary_op to reduce values with equal keys. + * + * \param keys_first The beginning of the input key range. + * \param keys_last The end of the input key range. + * \param values_first The beginning of the input value range. + * \param keys_output The beginning of the output key range. + * \param values_output The beginning of the output value range. + * \param binary_pred The binary predicate used to determine equality. + * \param binary_op The binary function used to accumulate values. + * \return A pair of iterators at end of the ranges [keys_output, keys_output_last) and [values_output, values_output_last). + * + * \tparam InputIterator1 is a model of Input Iterator, + * \tparam InputIterator2 is a model of Input Iterator, + * \tparam OutputIterator1 is a model of Output Iterator and + * and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type. + * \tparam OutputIterator2 is a model of Output Iterator and + * and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type. + * \tparam BinaryPredicate is a model of Binary Predicate. + * \tparam BinaryFunction is a model of Binary Function + * and \c BinaryFunction's \c result_type is convertible to \c OutputIterator2's \c value_type. + * + * \pre The input ranges shall not overlap either output range. + * + * The following code snippet demonstrates how to use \p reduce_by_key to + * compact a sequence of key/value pairs and sum values with equal keys. + * + * \code + * #include + * ... + * const int N = 7; + * int A[N] = {1, 3, 3, 3, 2, 2, 1}; // input keys + * int B[N] = {9, 8, 7, 6, 5, 4, 3}; // input values + * int C[N]; // output keys + * int D[N]; // output values + * + * thrust::pair new_end; + * thrust::equal_to binary_pred; + * thrust::plus binary_op; + * new_end = thrust::reduce_by_key(A, A + N, B, C, D, binary_pred, binary_op); + * + * // The first four keys in C are now {1, 3, 2, 1} and new_end.first - C is 4. + * // The first four values in D are now {9, 21, 9, 3} and new_end.second - D is 4. + * \endcode + * + * \see reduce + * \see unique_copy + * \see unique_by_key + * \see unique_by_key_copy + */ +template + thrust::pair + reduce_by_key(InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + OutputIterator1 keys_output, + OutputIterator2 values_output, + BinaryPredicate binary_pred, + BinaryFunction binary_op); + + +/*! \} // end reductions + */ + + +} // end namespace thrust + +#include + diff --git a/compat/thrust/remove.h b/compat/thrust/remove.h new file mode 100644 index 0000000..c538776 --- /dev/null +++ b/compat/thrust/remove.h @@ -0,0 +1,800 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file remove.h + * \brief Functions for removing elements from a range + */ + +#pragma once + +#include +#include + +namespace thrust +{ + + +/*! \addtogroup stream_compaction Stream Compaction + * \ingroup reordering + * \{ + * + */ + + +/*! \p remove removes from the range [first, last) all elements that are + * equal to \p value. That is, \p remove returns an iterator \p new_last such + * that the range [first, new_last) contains no elements equal to + * \p value. The iterators in the range [new_first,last) are all still + * dereferenceable, but the elements that they point to are unspecified. \p remove + * is stable, meaning that the relative order of elements that are not equal to + * \p value is unchanged. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the range of interest. + * \param last The end of the range of interest. + * \param value The value to remove from the range [first, last). + * Elements which are equal to value are removed from the sequence. + * \return A \p ForwardIterator pointing to the end of the resulting range of + * elements which are not equal to \p value. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam ForwardIterator is a model of Forward Iterator, + * and \p ForwardIterator is mutable. + * \tparam T is a model of Equality Comparable, + * and objects of type \p T can be compared for equality with objects of \p ForwardIterator's \c value_type. + * + * The following code snippet demonstrates how to use \p remove to remove a number + * of interest from a range using the \p thrust::host execution policy for parallelization: + * + * \code + * #include + * #include + * ... + * const int N = 6; + * int A[N] = {3, 1, 4, 1, 5, 9}; + * int *new_end = thrust::remove(A, A + N, 1); + * // The first four values of A are now {3, 4, 5, 9} + * // Values beyond new_end are unspecified + * \endcode + * + * \note The meaning of "removal" is somewhat subtle. \p remove does not destroy any + * iterators, and does not change the distance between \p first and \p last. + * (There's no way that it could do anything of the sort.) So, for example, if + * \c V is a device_vector, remove(V.begin(), V.end(), 0) does not + * change V.size(): \c V will contain just as many elements as it did + * before. \p remove returns an iterator that points to the end of the resulting + * range after elements have been removed from it; it follows that the elements + * after that iterator are of no interest, and may be discarded. If you are + * removing elements from a + * Sequence, you may + * simply erase them. That is, a reasonable way of removing elements from a + * Sequence is + * S.erase(remove(S.begin(), S.end(), x), S.end()). 
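+ *
+ * A minimal sketch of that erase-remove idiom with a \c device_vector
+ * (illustrative; assumes <thrust/device_vector.h> is included and dispatch
+ * is left to the iterators rather than an explicit policy):
+ *
+ * \code
+ * int init[6] = {3, 1, 4, 1, 5, 9};
+ * thrust::device_vector<int> V(init, init + 6);
+ * V.erase(thrust::remove(V.begin(), V.end(), 1), V.end());
+ * // V now holds {3, 4, 5, 9}
+ * \endcode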
+ * + * \see http://www.sgi.com/tech/stl/remove.html + * \see remove_if + * \see remove_copy + * \see remove_copy_if + */ +template + ForwardIterator remove(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + const T &value); + + +/*! \p remove removes from the range [first, last) all elements that are + * equal to \p value. That is, \p remove returns an iterator \p new_last such + * that the range [first, new_last) contains no elements equal to + * \p value. The iterators in the range [new_first,last) are all still + * dereferenceable, but the elements that they point to are unspecified. \p remove + * is stable, meaning that the relative order of elements that are not equal to + * \p value is unchanged. + * + * \param first The beginning of the range of interest. + * \param last The end of the range of interest. + * \param value The value to remove from the range [first, last). + * Elements which are equal to value are removed from the sequence. + * \return A \p ForwardIterator pointing to the end of the resulting range of + * elements which are not equal to \p value. + * + * \tparam ForwardIterator is a model of Forward Iterator, + * and \p ForwardIterator is mutable. + * \tparam T is a model of Equality Comparable, + * and objects of type \p T can be compared for equality with objects of \p ForwardIterator's \c value_type. + * + * The following code snippet demonstrates how to use \p remove to remove a number + * of interest from a range. + * + * \code + * #include + * ... + * const int N = 6; + * int A[N] = {3, 1, 4, 1, 5, 9}; + * int *new_end = thrust::remove(A, A + N, 1); + * // The first four values of A are now {3, 4, 5, 9} + * // Values beyond new_end are unspecified + * \endcode + * + * \note The meaning of "removal" is somewhat subtle. \p remove does not destroy any + * iterators, and does not change the distance between \p first and \p last. + * (There's no way that it could do anything of the sort.) So, for example, if + * \c V is a device_vector, remove(V.begin(), V.end(), 0) does not + * change V.size(): \c V will contain just as many elements as it did + * before. \p remove returns an iterator that points to the end of the resulting + * range after elements have been removed from it; it follows that the elements + * after that iterator are of no interest, and may be discarded. If you are + * removing elements from a + * Sequence, you may + * simply erase them. That is, a reasonable way of removing elements from a + * Sequence is + * S.erase(remove(S.begin(), S.end(), x), S.end()). + * + * \see http://www.sgi.com/tech/stl/remove.html + * \see remove_if + * \see remove_copy + * \see remove_copy_if + */ +template + ForwardIterator remove(ForwardIterator first, + ForwardIterator last, + const T &value); + + +/*! \p remove_copy copies elements that are not equal to \p value from the range + * [first, last) to a range beginning at \p result. The return value is + * the end of the resulting range. This operation is stable, meaning that the + * relative order of the elements that are copied is the same as in + * the range [first, last). + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the range of interest. + * \param last The end of the range of interest. + * \param result The resulting range is copied to the sequence beginning at this + * location. 
+ * \param value The value to omit from the copied range. + * \return An OutputIterator pointing to the end of the resulting range of elements + * which are not equal to \p value. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator is a model of Input Iterator, + * and \p InputIterator's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types. + * \tparam OutputIterator is a model of Output Iterator. + * \tparam T is a model of Equality Comparable, + * and objects of type \p T can be compared for equality with objects of \p InputIterator's \c value_type. + * + * \pre The range [first, last) shall not overlap the range [result, result + (last - first)). + * + * The following code snippet demonstrates how to use \p remove_copy to copy + * a sequence of numbers to an output range while omitting a value of interest using the \p thrust::host + * execution policy for parallelization: + * + * \code + * #include + * #include + * ... + * const int N = 6; + * int V[N] = {-2, 0, -1, 0, 1, 2}; + * int result[N-2]; + * thrust::remove_copy(thrust::host, V, V + N, result, 0); + * // V remains {-2, 0, -1, 0, 1, 2} + * // result is now {-2, -1, 1, 2} + * \endcode + * + * \see http://www.sgi.com/tech/stl/remove_copy.html + * \see remove + * \see remove_if + * \see remove_copy_if + */ +template + OutputIterator remove_copy(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + const T &value); + + +/*! \p remove_copy copies elements that are not equal to \p value from the range + * [first, last) to a range beginning at \p result. The return value is + * the end of the resulting range. This operation is stable, meaning that the + * relative order of the elements that are copied is the same as in + * the range [first, last). + * + * \param first The beginning of the range of interest. + * \param last The end of the range of interest. + * \param result The resulting range is copied to the sequence beginning at this + * location. + * \param value The value to omit from the copied range. + * \return An OutputIterator pointing to the end of the resulting range of elements + * which are not equal to \p value. + * + * \tparam InputIterator is a model of Input Iterator, + * and \p InputIterator's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types. + * \tparam OutputIterator is a model of Output Iterator. + * \tparam T is a model of Equality Comparable, + * and objects of type \p T can be compared for equality with objects of \p InputIterator's \c value_type. + * + * \pre The range [first, last) shall not overlap the range [result, result + (last - first)). + * + * The following code snippet demonstrates how to use \p remove_copy to copy + * a sequence of numbers to an output range while omitting a value of interest. + * + * \code + * #include + * ... + * const int N = 6; + * int V[N] = {-2, 0, -1, 0, 1, 2}; + * int result[N-2]; + * thrust::remove_copy(V, V + N, result, 0); + * // V remains {-2, 0, -1, 0, 1, 2} + * // result is now {-2, -1, 1, 2} + * \endcode + * + * \see http://www.sgi.com/tech/stl/remove_copy.html + * \see remove + * \see remove_if + * \see remove_copy_if + */ +template + OutputIterator remove_copy(InputIterator first, + InputIterator last, + OutputIterator result, + const T &value); + + +/*! \p remove_if removes from the range [first, last) every element \p x + * such that pred(x) is \c true. 
That is, \p remove_if returns an + * iterator \c new_last such that the range [first,new_last) contains + * no elements for which \p pred is \c true. The iterators in the range + * [new_last,last) are all still dereferenceable, but the elements that + * they point to are unspecified. \p remove_if is stable, meaning that the + * relative order of elements that are not removed is unchanged. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the range of interest. + * \param last The end of the range of interest. + * \param pred A predicate to evaluate for each element of the range + * [first,last). Elements for which \p pred evaluates to + * \c false are removed from the sequence. + * \return A ForwardIterator pointing to the end of the resulting range of + * elements for which \p pred evaluated to \c true. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam ForwardIterator is a model of Forward Iterator, + * \p ForwardIterator is mutable, + * and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type. + * \tparam Predicate is a model of Predicate. + * + * The following code snippet demonstrates how to use \p remove_if to remove + * all even numbers from an array of integers using the \p thrust::host execution policy for + * parallelization: + * + * \code + * #include + * #include + * ... + * struct is_even + * { + * __host__ __device__ + * bool operator()(const int x) + * { + * return (x % 2) == 0; + * } + * }; + * ... + * const int N = 6; + * int A[N] = {1, 4, 2, 8, 5, 7}; + * int *new_end = thrust::remove_if(thrust::host, A, A + N, is_even()); + * // The first three values of A are now {1, 5, 7} + * // Values beyond new_end are unspecified + * \endcode + * + * \note The meaning of "removal" is somewhat subtle. \p remove_if does not + * destroy any iterators, and does not change the distance between \p first and + * \p last. (There's no way that it could do anything of the sort.) So, for + * example, if \c V is a device_vector, + * remove_if(V.begin(), V.end(), pred) does not change + * V.size(): \c V will contain just as many elements as it did before. + * \p remove_if returns an iterator that points to the end of the resulting + * range after elements have been removed from it; it follows that the elements + * after that iterator are of no interest, and may be discarded. If you are + * removing elements from a + * Sequence, you may + * simply erase them. That is, a reasonable way of removing elements from a + * Sequence is + * S.erase(remove_if(S.begin(), S.end(), pred), S.end()). + * + * \see http://www.sgi.com/tech/stl/remove_if.html + * \see remove + * \see remove_copy + * \see remove_copy_if + */ +template + ForwardIterator remove_if(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + Predicate pred); + + +/*! \p remove_if removes from the range [first, last) every element \p x + * such that pred(x) is \c true. That is, \p remove_if returns an + * iterator \c new_last such that the range [first,new_last) contains + * no elements for which \p pred is \c true. The iterators in the range + * [new_last,last) are all still dereferenceable, but the elements that + * they point to are unspecified. \p remove_if is stable, meaning that the + * relative order of elements that are not removed is unchanged. 
+ * + * \param first The beginning of the range of interest. + * \param last The end of the range of interest. + * \param pred A predicate to evaluate for each element of the range + * [first,last). Elements for which \p pred evaluates to + * \c false are removed from the sequence. + * \return A ForwardIterator pointing to the end of the resulting range of + * elements for which \p pred evaluated to \c true. + * + * \tparam ForwardIterator is a model of Forward Iterator, + * \p ForwardIterator is mutable, + * and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type. + * \tparam Predicate is a model of Predicate. + * + * The following code snippet demonstrates how to use \p remove_if to remove + * all even numbers from an array of integers. + * + * \code + * #include + * ... + * struct is_even + * { + * __host__ __device__ + * bool operator()(const int x) + * { + * return (x % 2) == 0; + * } + * }; + * ... + * const int N = 6; + * int A[N] = {1, 4, 2, 8, 5, 7}; + * int *new_end = thrust::remove_if(A, A + N, is_even()); + * // The first three values of A are now {1, 5, 7} + * // Values beyond new_end are unspecified + * \endcode + * + * \note The meaning of "removal" is somewhat subtle. \p remove_if does not + * destroy any iterators, and does not change the distance between \p first and + * \p last. (There's no way that it could do anything of the sort.) So, for + * example, if \c V is a device_vector, + * remove_if(V.begin(), V.end(), pred) does not change + * V.size(): \c V will contain just as many elements as it did before. + * \p remove_if returns an iterator that points to the end of the resulting + * range after elements have been removed from it; it follows that the elements + * after that iterator are of no interest, and may be discarded. If you are + * removing elements from a + * Sequence, you may + * simply erase them. That is, a reasonable way of removing elements from a + * Sequence is + * S.erase(remove_if(S.begin(), S.end(), pred), S.end()). + * + * \see http://www.sgi.com/tech/stl/remove_if.html + * \see remove + * \see remove_copy + * \see remove_copy_if + */ +template + ForwardIterator remove_if(ForwardIterator first, + ForwardIterator last, + Predicate pred); + + +/*! \p remove_copy_if copies elements from the range [first,last) to a + * range beginning at \p result, except that elements for which \p pred is + * \c true are not copied. The return value is the end of the resulting range. + * This operation is stable, meaning that the relative order of the elements that + * are copied is the same as the range [first,last). + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the range of interest. + * \param last The end of the range of interest. + * \param result The resulting range is copied to the sequence beginning at this + * location. + * \param pred A predicate to evaluate for each element of the range [first,last). + * Elements for which \p pred evaluates to \c false are not copied + * to the resulting sequence. + * \return An OutputIterator pointing to the end of the resulting range. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator is a model of Input Iterator, + * \p InputIterator's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types, + * and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type. 
+ * \tparam OutputIterator is a model of Output Iterator. + * \tparam Predicate is a model of Predicate. + * + * \pre The range [first, last) shall not overlap the range [result, result + (last - first)). + * + * The following code snippet demonstrates how to use \p remove_copy_if to copy + * a sequence of numbers to an output range while omitting even numbers using the \p thrust::host + * execution policy for parallelization: + * + * \code + * #include + * #include + * ... + * struct is_even + * { + * __host__ __device__ + * bool operator()(const int x) + * { + * return (x % 2) == 0; + * } + * }; + * ... + * const int N = 6; + * int V[N] = {-2, 0, -1, 0, 1, 2}; + * int result[2]; + * thrust::remove_copy_if(thrust::host, V, V + N, result, is_even()); + * // V remains {-2, 0, -1, 0, 1, 2} + * // result is now {-1, 1} + * \endcode + * + * \see http://www.sgi.com/tech/stl/remove_copy_if.html + * \see remove + * \see remove_copy + * \see remove_if + */ +template + OutputIterator remove_copy_if(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + Predicate pred); + + +/*! \p remove_copy_if copies elements from the range [first,last) to a + * range beginning at \p result, except that elements for which \p pred is + * \c true are not copied. The return value is the end of the resulting range. + * This operation is stable, meaning that the relative order of the elements that + * are copied is the same as the range [first,last). + * + * \param first The beginning of the range of interest. + * \param last The end of the range of interest. + * \param result The resulting range is copied to the sequence beginning at this + * location. + * \param pred A predicate to evaluate for each element of the range [first,last). + * Elements for which \p pred evaluates to \c false are not copied + * to the resulting sequence. + * \return An OutputIterator pointing to the end of the resulting range. + * + * \tparam InputIterator is a model of Input Iterator, + * \p InputIterator's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types, + * and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type. + * \tparam OutputIterator is a model of Output Iterator. + * \tparam Predicate is a model of Predicate. + * + * \pre The range [first, last) shall not overlap the range [result, result + (last - first)). + * + * The following code snippet demonstrates how to use \p remove_copy_if to copy + * a sequence of numbers to an output range while omitting even numbers. + * + * \code + * #include + * ... + * struct is_even + * { + * __host__ __device__ + * bool operator()(const int x) + * { + * return (x % 2) == 0; + * } + * }; + * ... + * const int N = 6; + * int V[N] = {-2, 0, -1, 0, 1, 2}; + * int result[2]; + * thrust::remove_copy_if(V, V + N, result, is_even()); + * // V remains {-2, 0, -1, 0, 1, 2} + * // result is now {-1, 1} + * \endcode + * + * \see http://www.sgi.com/tech/stl/remove_copy_if.html + * \see remove + * \see remove_copy + * \see remove_if + */ +template + OutputIterator remove_copy_if(InputIterator first, + InputIterator last, + OutputIterator result, + Predicate pred); + + +/*! \p remove_if removes from the range [first, last) every element \p x + * such that pred(x) is \c true. That is, \p remove_if returns an + * iterator \c new_last such that the range [first, new_last) contains + * no elements for which \p pred of the corresponding stencil value is \c true. 
+ * The iterators in the range [new_last,last) are all still dereferenceable, + * but the elements that they point to are unspecified. \p remove_if is stable, + * meaning that the relative order of elements that are not removed is unchanged. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the range of interest. + * \param last The end of the range of interest. + * \param stencil The beginning of the stencil sequence. + * \param pred A predicate to evaluate for each element of the range + * [stencil, stencil + (last - first)). Elements for which \p pred evaluates to + * \c false are removed from the sequence [first, last) + * \return A ForwardIterator pointing to the end of the resulting range of + * elements for which \p pred evaluated to \c true. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam ForwardIterator is a model of Forward Iterator + * and \p ForwardIterator is mutable. + * \tparam InputIterator is a model of Input Iterator, + * and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type. + * \tparam Predicate is a model of Predicate. + * + * \pre The range [first, last) shall not overlap the range [result, result + (last - first)). + * \pre The range [stencil, stencil + (last - first)) shall not overlap the range [result, result + (last - first)). + * + * The following code snippet demonstrates how to use \p remove_if to remove + * specific elements from an array of integers using the \p thrust::host execution policy for + * parallelization: + * + * \code + * #include + * #include + * ... + * const int N = 6; + * int A[N] = {1, 4, 2, 8, 5, 7}; + * int S[N] = {0, 1, 1, 1, 0, 0}; + * + * int *new_end = thrust::remove(thrust::host, A, A + N, S, thrust::identity()); + * // The first three values of A are now {1, 5, 7} + * // Values beyond new_end are unspecified + * \endcode + * + * \note The range [first, last) is not permitted to overlap with the range [stencil, stencil + (last - first)). + * + * \see http://www.sgi.com/tech/stl/remove_if.html + * \see remove + * \see remove_copy + * \see remove_copy_if + */ +template + ForwardIterator remove_if(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + InputIterator stencil, + Predicate pred); + + +/*! \p remove_if removes from the range [first, last) every element \p x + * such that pred(x) is \c true. That is, \p remove_if returns an + * iterator \c new_last such that the range [first, new_last) contains + * no elements for which \p pred of the corresponding stencil value is \c true. + * The iterators in the range [new_last,last) are all still dereferenceable, + * but the elements that they point to are unspecified. \p remove_if is stable, + * meaning that the relative order of elements that are not removed is unchanged. + * + * \param first The beginning of the range of interest. + * \param last The end of the range of interest. + * \param stencil The beginning of the stencil sequence. + * \param pred A predicate to evaluate for each element of the range + * [stencil, stencil + (last - first)). Elements for which \p pred evaluates to + * \c false are removed from the sequence [first, last) + * \return A ForwardIterator pointing to the end of the resulting range of + * elements for which \p pred evaluated to \c true. 
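+ *
+ * A minimal sketch combining this stencil overload with \c erase on a
+ * \c device_vector (illustrative data and names; assumes
+ * <thrust/device_vector.h> and <thrust/functional.h> are included):
+ *
+ * \code
+ * int data[4]    = {10, 20, 30, 40};
+ * int stencil[4] = { 0,  1,  0,  1};
+ * thrust::device_vector<int> vals(data, data + 4);
+ * thrust::device_vector<int> flags(stencil, stencil + 4);
+ * vals.erase(thrust::remove_if(vals.begin(), vals.end(), flags.begin(),
+ *                              thrust::identity<int>()),
+ *            vals.end());
+ * // vals now holds {10, 30}
+ * \endcode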
+ * + * \tparam ForwardIterator is a model of Forward Iterator + * and \p ForwardIterator is mutable. + * \tparam InputIterator is a model of Input Iterator, + * and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type. + * \tparam Predicate is a model of Predicate. + * + * \pre The range [first, last) shall not overlap the range [result, result + (last - first)). + * \pre The range [stencil, stencil + (last - first)) shall not overlap the range [result, result + (last - first)). + * + * The following code snippet demonstrates how to use \p remove_if to remove + * specific elements from an array of integers. + * + * \code + * #include + * ... + * const int N = 6; + * int A[N] = {1, 4, 2, 8, 5, 7}; + * int S[N] = {0, 1, 1, 1, 0, 0}; + * + * int *new_end = thrust::remove(A, A + N, S, thrust::identity()); + * // The first three values of A are now {1, 5, 7} + * // Values beyond new_end are unspecified + * \endcode + * + * \note The range [first, last) is not permitted to overlap with the range [stencil, stencil + (last - first)). + * + * \see http://www.sgi.com/tech/stl/remove_if.html + * \see remove + * \see remove_copy + * \see remove_copy_if + */ +template + ForwardIterator remove_if(ForwardIterator first, + ForwardIterator last, + InputIterator stencil, + Predicate pred); + + +/*! \p remove_copy_if copies elements from the range [first,last) to a + * range beginning at \p result, except that elements for which \p pred of the + * corresponding stencil value is \c true are not copied. The return value is + * the end of the resulting range. This operation is stable, meaning that the + * relative order of the elements that are copied is the same as the + * range [first,last). + * + * The algorithm's execution policy is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the range of interest. + * \param last The end of the range of interest. + * \param stencil The beginning of the stencil sequence. + * \param result The resulting range is copied to the sequence beginning at this + * location. + * \param pred A predicate to evaluate for each element of the range [first,last). + * Elements for which \p pred evaluates to \c false are not copied + * to the resulting sequence. + * \return An OutputIterator pointing to the end of the resulting range. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator1 is a model of Input Iterator, + * \p InputIterator1's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types. + * \tparam InputIterator2 is a model of Input Iterator, + * and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type. + * \tparam OutputIterator is a model of Output Iterator. + * \tparam Predicate is a model of Predicate. + * + * \pre The range [stencil, stencil + (last - first)) shall not overlap the range [result, result + (last - first)). + * + * The following code snippet demonstrates how to use \p remove_copy_if to copy + * a sequence of numbers to an output range while omitting specific elements using the \p thrust::host + * execution policy for parallelization. + * + * \code + * #include + * #include + * ... 
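+ * // copy only the elements whose corresponding stencil entry is zero; flagged elements are omitted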
+/*! \p remove_copy_if copies elements from the range [first,last) to a
+ * range beginning at \p result, except that elements for which \p pred of the
+ * corresponding stencil value is \c true are not copied. The return value is
+ * the end of the resulting range. This operation is stable, meaning that the
+ * relative order of the elements that are copied is the same as the
+ * range [first,last).
+ *
+ * The algorithm's execution is parallelized as determined by \p exec.
+ *
+ * \param exec The execution policy to use for parallelization.
+ * \param first The beginning of the range of interest.
+ * \param last The end of the range of interest.
+ * \param stencil The beginning of the stencil sequence.
+ * \param result The resulting range is copied to the sequence beginning at this
+ * location.
+ * \param pred A predicate to evaluate for each element of the range
+ * [stencil, stencil + (last - first)). Elements whose stencil value causes \p pred
+ * to evaluate to \c true are not copied to the resulting sequence.
+ * \return An OutputIterator pointing to the end of the resulting range.
+ *
+ * \tparam DerivedPolicy The name of the derived execution policy.
+ * \tparam InputIterator1 is a model of Input Iterator,
+ * \p InputIterator1's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types.
+ * \tparam InputIterator2 is a model of Input Iterator,
+ * and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type.
+ * \tparam OutputIterator is a model of Output Iterator.
+ * \tparam Predicate is a model of Predicate.
+ *
+ * \pre The range [stencil, stencil + (last - first)) shall not overlap the range [result, result + (last - first)).
+ *
+ * The following code snippet demonstrates how to use \p remove_copy_if to copy
+ * a sequence of numbers to an output range while omitting specific elements using the \p thrust::host
+ * execution policy for parallelization.
+ *
+ * \code
+ * #include <thrust/remove.h>
+ * #include <thrust/execution_policy.h>
+ * ...
+ * const int N = 6;
+ * int V[N] = {-2, 0, -1, 0, 1, 2};
+ * int S[N] = { 1, 1, 0, 1, 0, 1};
+ * int result[2];
+ * thrust::remove_copy_if(thrust::host, V, V + N, S, result, thrust::identity<int>());
+ * // V remains {-2, 0, -1, 0, 1, 2}
+ * // result is now {-1, 1}
+ * \endcode
+ *
+ * \see http://www.sgi.com/tech/stl/remove_copy_if.html
+ * \see remove
+ * \see remove_copy
+ * \see remove_if
+ * \see copy_if
+ */
+template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2, typename OutputIterator, typename Predicate>
+  OutputIterator remove_copy_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                InputIterator1 first,
+                                InputIterator1 last,
+                                InputIterator2 stencil,
+                                OutputIterator result,
+                                Predicate pred);
+
+
+/*! \p remove_copy_if copies elements from the range [first,last) to a
+ * range beginning at \p result, except that elements for which \p pred of the
+ * corresponding stencil value is \c true are not copied. The return value is
+ * the end of the resulting range. This operation is stable, meaning that the
+ * relative order of the elements that are copied is the same as the
+ * range [first,last).
+ *
+ * \param first The beginning of the range of interest.
+ * \param last The end of the range of interest.
+ * \param stencil The beginning of the stencil sequence.
+ * \param result The resulting range is copied to the sequence beginning at this
+ * location.
+ * \param pred A predicate to evaluate for each element of the range
+ * [stencil, stencil + (last - first)). Elements whose stencil value causes \p pred
+ * to evaluate to \c true are not copied to the resulting sequence.
+ * \return An OutputIterator pointing to the end of the resulting range.
+ *
+ * \tparam InputIterator1 is a model of Input Iterator,
+ * \p InputIterator1's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types.
+ * \tparam InputIterator2 is a model of Input Iterator,
+ * and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type.
+ * \tparam OutputIterator is a model of Output Iterator.
+ * \tparam Predicate is a model of Predicate.
+ *
+ * \pre The range [stencil, stencil + (last - first)) shall not overlap the range [result, result + (last - first)).
+ *
+ * The following code snippet demonstrates how to use \p remove_copy_if to copy
+ * a sequence of numbers to an output range while omitting specific elements.
+ *
+ * \code
+ * #include <thrust/remove.h>
+ * ...
+ * const int N = 6;
+ * int V[N] = {-2, 0, -1, 0, 1, 2};
+ * int S[N] = { 1, 1, 0, 1, 0, 1};
+ * int result[2];
+ * thrust::remove_copy_if(V, V + N, S, result, thrust::identity<int>());
+ * // V remains {-2, 0, -1, 0, 1, 2}
+ * // result is now {-1, 1}
+ * \endcode
+ *
+ * \see http://www.sgi.com/tech/stl/remove_copy_if.html
+ * \see remove
+ * \see remove_copy
+ * \see remove_if
+ * \see copy_if
+ */
+template<typename InputIterator1, typename InputIterator2, typename OutputIterator, typename Predicate>
+  OutputIterator remove_copy_if(InputIterator1 first,
+                                InputIterator1 last,
+                                InputIterator2 stencil,
+                                OutputIterator result,
+                                Predicate pred);
+
+
+/*! \} // end stream_compaction
+ */
+
+
+} // end thrust
+
+#include <thrust/detail/remove.inl>
+
diff --git a/compat/thrust/replace.h b/compat/thrust/replace.h
new file mode 100644
index 0000000..48e3e49
--- /dev/null
+++ b/compat/thrust/replace.h
@@ -0,0 +1,817 @@
+/*
+ * Copyright 2008-2012 NVIDIA Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file replace.h + * \brief Functions for replacing elements in a range with a particular value + */ + +#pragma once + +#include +#include + +namespace thrust +{ + + +/*! \addtogroup transformations + * \addtogroup replacing + * \ingroup transformations + * \{ + */ + + +/*! \p replace replaces every element in the range [first, last) equal to \p old_value + * with \p new_value. That is: for every iterator \c i, if *i == old_value + * then it performs the assignment *i = new_value. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the sequence of interest. + * \param last The end of the sequence of interest. + * \param old_value The value to replace. + * \param new_value The new value to replace \p old_value. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam ForwardIterator is a model of Forward Iterator, + * and \p ForwardIterator is mutable. + * \tparam T is a model of Assignable, + * \p T is a model of EqualityComparable, + * objects of \p T may be compared for equality with objects of + * \p ForwardIterator's \c value_type, + * and \p T is convertible to \p ForwardIterator's \c value_type. + * + * The following code snippet demonstrates how to use \p replace to replace + * a value of interest in a \c device_vector with another using the \p thrust::device + * execution policy for parallelization: + * + * \code + * #include + * #include + * #include + * + * ... + * + * thrust::device_vector A(4); + * A[0] = 1; + * A[1] = 2; + * A[2] = 3; + * A[3] = 1; + * + * thrust::replace(thrust::device, A.begin(), A.end(), 1, 99); + * + * // A contains [99, 2, 3, 99] + * \endcode + * + * \see http://www.sgi.com/tech/stl/replace.html + * \see \c replace_if + * \see \c replace_copy + * \see \c replace_copy_if + */ +template + void replace(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, ForwardIterator last, + const T &old_value, + const T &new_value); + + +/*! \p replace replaces every element in the range [first, last) equal to \p old_value + * with \p new_value. That is: for every iterator \c i, if *i == old_value + * then it performs the assignment *i = new_value. + * + * \param first The beginning of the sequence of interest. + * \param last The end of the sequence of interest. + * \param old_value The value to replace. + * \param new_value The new value to replace \p old_value. + * + * \tparam ForwardIterator is a model of Forward Iterator, + * and \p ForwardIterator is mutable. + * \tparam T is a model of Assignable, + * \p T is a model of EqualityComparable, + * objects of \p T may be compared for equality with objects of + * \p ForwardIterator's \c value_type, + * and \p T is convertible to \p ForwardIterator's \c value_type. + * + * The following code snippet demonstrates how to use \p replace to replace + * a value of interest in a \c device_vector with another. + * + * \code + * #include + * #include + * + * ... 
+ * + * thrust::device_vector A(4); + * A[0] = 1; + * A[1] = 2; + * A[2] = 3; + * A[3] = 1; + * + * thrust::replace(A.begin(), A.end(), 1, 99); + * + * // A contains [99, 2, 3, 99] + * \endcode + * + * \see http://www.sgi.com/tech/stl/replace.html + * \see \c replace_if + * \see \c replace_copy + * \see \c replace_copy_if + */ +template + void replace(ForwardIterator first, ForwardIterator last, const T &old_value, + const T &new_value); + + +/*! \p replace_if replaces every element in the range [first, last) for which + * \p pred returns \c true with \p new_value. That is: for every iterator \c i, if + * pred(*i) is \c true then it performs the assignment *i = new_value. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the sequence of interest. + * \param last The end of the sequence of interest. + * \param pred The predicate to test on every value of the range [first,last). + * \param new_value The new value to replace elements which pred(*i) evaluates + * to \c true. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam ForwardIterator is a model of Forward Iterator, + * \p ForwardIterator is mutable, + * and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type. + * \tparam Predicate is a model of Predicate. + * \tparam T is a model of Assignable, + * and \p T is convertible to \p ForwardIterator's \c value_type. + * + * The following code snippet demonstrates how to use \p replace_if to replace + * a \c device_vector's negative elements with \c 0 using the \p thrust::device execution policy + * for parallelization: + * + * \code + * #include + * #include + * #include + * ... + * struct is_less_than_zero + * { + * __host__ __device__ + * bool operator()(int x) + * { + * return x < 0; + * } + * }; + * + * ... + * + * thrust::device_vector A(4); + * A[0] = 1; + * A[1] = -3; + * A[2] = 2; + * A[3] = -1; + * + * is_less_than_zero pred; + * + * thrust::replace_if(thrust::device, A.begin(), A.end(), pred, 0); + * + * // A contains [1, 0, 2, 0] + * \endcode + * + * \see http://www.sgi.com/tech/stl/replace_if.html + * \see \c replace + * \see \c replace_copy + * \see \c replace_copy_if + */ +template + void replace_if(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, ForwardIterator last, + Predicate pred, + const T &new_value); + + +/*! \p replace_if replaces every element in the range [first, last) for which + * \p pred returns \c true with \p new_value. That is: for every iterator \c i, if + * pred(*i) is \c true then it performs the assignment *i = new_value. + * + * \param first The beginning of the sequence of interest. + * \param last The end of the sequence of interest. + * \param pred The predicate to test on every value of the range [first,last). + * \param new_value The new value to replace elements which pred(*i) evaluates + * to \c true. + * + * \tparam ForwardIterator is a model of Forward Iterator, + * \p ForwardIterator is mutable, + * and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type. + * \tparam Predicate is a model of Predicate. + * \tparam T is a model of Assignable, + * and \p T is convertible to \p ForwardIterator's \c value_type. + * + * The following code snippet demonstrates how to use \p replace_if to replace + * a \c device_vector's negative elements with \c 0. + * + * \code + * #include + * #include + * ... 
+ * struct is_less_than_zero + * { + * __host__ __device__ + * bool operator()(int x) + * { + * return x < 0; + * } + * }; + * + * ... + * + * thrust::device_vector A(4); + * A[0] = 1; + * A[1] = -3; + * A[2] = 2; + * A[3] = -1; + * + * is_less_than_zero pred; + * + * thrust::replace_if(A.begin(), A.end(), pred, 0); + * + * // A contains [1, 0, 2, 0] + * \endcode + * + * \see http://www.sgi.com/tech/stl/replace_if.html + * \see \c replace + * \see \c replace_copy + * \see \c replace_copy_if + */ +template + void replace_if(ForwardIterator first, ForwardIterator last, + Predicate pred, + const T &new_value); + + +/*! \p replace_if replaces every element in the range [first, last) for which + * pred(*s) returns \c true with \p new_value. That is: for every iterator + * \c i in the range [first, last), and \c s in the range [stencil, stencil + (last - first)), + * if pred(*s) is \c true then it performs the assignment *i = new_value. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the sequence of interest. + * \param last The end of the sequence of interest. + * \param stencil The beginning of the stencil sequence. + * \param pred The predicate to test on every value of the range [first,last). + * \param new_value The new value to replace elements which pred(*i) evaluates + * to \c true. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam ForwardIterator is a model of Forward Iterator, + * and \p ForwardIterator is mutable. + * \tparam InputIterator is a model of Input Iterator, + * and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type. + * \tparam Predicate is a model of Predicate. + * \tparam T is a model of Assignable, + * and \p T is convertible to \p ForwardIterator's \c value_type. + * + * The following code snippet demonstrates how to use \p replace_if to replace + * a \c device_vector's element with \c 0 when its corresponding stencil element is less than zero + * using the \p thrust::device execution policy for parallelization: + * + * \code + * #include + * #include + * #include + * + * struct is_less_than_zero + * { + * __host__ __device__ + * bool operator()(int x) + * { + * return x < 0; + * } + * }; + * + * ... + * + * thrust::device_vector A(4); + * A[0] = 10; + * A[1] = 20; + * A[2] = 30; + * A[3] = 40; + * + * thrust::device_vector S(4); + * S[0] = -1; + * S[1] = 0; + * S[2] = -1; + * S[3] = 0; + * + * is_less_than_zero pred; + * thrust::replace_if(thrust::device, A.begin(), A.end(), S.begin(), pred, 0); + * + * // A contains [0, 20, 0, 40] + * \endcode + * + * \see http://www.sgi.com/tech/stl/replace_if.html + * \see \c replace + * \see \c replace_copy + * \see \c replace_copy_if + */ +template + void replace_if(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, ForwardIterator last, + InputIterator stencil, + Predicate pred, + const T &new_value); + + +/*! \p replace_if replaces every element in the range [first, last) for which + * pred(*s) returns \c true with \p new_value. That is: for every iterator + * \c i in the range [first, last), and \c s in the range [stencil, stencil + (last - first)), + * if pred(*s) is \c true then it performs the assignment *i = new_value. + * + * \param first The beginning of the sequence of interest. + * \param last The end of the sequence of interest. + * \param stencil The beginning of the stencil sequence. 
+ * \param pred The predicate to test on every value of the range [first,last). + * \param new_value The new value to replace elements which pred(*i) evaluates + * to \c true. + * + * \tparam ForwardIterator is a model of Forward Iterator, + * and \p ForwardIterator is mutable. + * \tparam InputIterator is a model of Input Iterator, + * and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type. + * \tparam Predicate is a model of Predicate. + * \tparam T is a model of Assignable, + * and \p T is convertible to \p ForwardIterator's \c value_type. + * + * The following code snippet demonstrates how to use \p replace_if to replace + * a \c device_vector's element with \c 0 when its corresponding stencil element is less than zero. + * + * \code + * #include + * #include + * + * struct is_less_than_zero + * { + * __host__ __device__ + * bool operator()(int x) + * { + * return x < 0; + * } + * }; + * + * ... + * + * thrust::device_vector A(4); + * A[0] = 10; + * A[1] = 20; + * A[2] = 30; + * A[3] = 40; + * + * thrust::device_vector S(4); + * S[0] = -1; + * S[1] = 0; + * S[2] = -1; + * S[3] = 0; + * + * is_less_than_zero pred; + * thrust::replace_if(A.begin(), A.end(), S.begin(), pred, 0); + * + * // A contains [0, 20, 0, 40] + * \endcode + * + * \see http://www.sgi.com/tech/stl/replace_if.html + * \see \c replace + * \see \c replace_copy + * \see \c replace_copy_if + */ +template + void replace_if(ForwardIterator first, ForwardIterator last, + InputIterator stencil, + Predicate pred, + const T &new_value); + + +/*! \p replace_copy copies elements from the range [first, last) to the range + * [result, result + (last-first)), except that any element equal to \p old_value + * is not copied; \p new_value is copied instead. + * + * More precisely, for every integer \c n such that 0 <= n < last-first, \p replace_copy + * performs the assignment *(result+n) = new_value if *(first+n) == old_value, + * and *(result+n) = *(first+n) otherwise. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the sequence to copy from. + * \param last The end of the sequence to copy from. + * \param result The beginning of the sequence to copy to. + * \param old_value The value to replace. + * \param new_value The replacement value for which *i == old_value evaluates to \c true. + * \return result + (last-first) + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator is a model of Input Iterator. + * \tparam OutputIterator is a model of Output Iterator. + * \tparam T is a model of Assignable, + * \p T is a model of Equality Comparable, + * \p T may be compared for equality with \p InputIterator's \c value_type, + * and \p T is convertible to \p OutputIterator's \c value_type. + * + * \pre \p first may equal \p result, but the ranges [first, last) and [result, result + (last - first)) shall not overlap otherwise. + * + * \code + * #include + * #include + * #include + * ... 
+ * thrust::device_vector A(4); + * A[0] = 1; + * A[1] = 2; + * A[2] = 3; + * A[3] = 1; + * + * thrust::device_vector B(4); + * + * thrust::replace_copy(thrust::device, A.begin(), A.end(), B.begin(), 1, 99); + * + * // B contains [99, 2, 3, 99] + * \endcode + * + * \see http://www.sgi.com/tech/stl/replace_copy.html + * \see \c copy + * \see \c replace + * \see \c replace_if + * \see \c replace_copy_if + */ +template + OutputIterator replace_copy(const thrust::detail::execution_policy_base &exec, + InputIterator first, InputIterator last, + OutputIterator result, + const T &old_value, + const T &new_value); + + +/*! \p replace_copy copies elements from the range [first, last) to the range + * [result, result + (last-first)), except that any element equal to \p old_value + * is not copied; \p new_value is copied instead. + * + * More precisely, for every integer \c n such that 0 <= n < last-first, \p replace_copy + * performs the assignment *(result+n) = new_value if *(first+n) == old_value, + * and *(result+n) = *(first+n) otherwise. + * + * \param first The beginning of the sequence to copy from. + * \param last The end of the sequence to copy from. + * \param result The beginning of the sequence to copy to. + * \param old_value The value to replace. + * \param new_value The replacement value for which *i == old_value evaluates to \c true. + * \return result + (last-first) + * + * \tparam InputIterator is a model of Input Iterator. + * \tparam OutputIterator is a model of Output Iterator. + * \tparam T is a model of Assignable, + * \p T is a model of Equality Comparable, + * \p T may be compared for equality with \p InputIterator's \c value_type, + * and \p T is convertible to \p OutputIterator's \c value_type. + * + * \pre \p first may equal \p result, but the ranges [first, last) and [result, result + (last - first)) shall not overlap otherwise. + * + * \code + * #include + * #include + * ... + * thrust::device_vector A(4); + * A[0] = 1; + * A[1] = 2; + * A[2] = 3; + * A[3] = 1; + * + * thrust::device_vector B(4); + * + * thrust::replace_copy(A.begin(), A.end(), B.begin(), 1, 99); + * + * // B contains [99, 2, 3, 99] + * \endcode + * + * \see http://www.sgi.com/tech/stl/replace_copy.html + * \see \c copy + * \see \c replace + * \see \c replace_if + * \see \c replace_copy_if + */ +template + OutputIterator replace_copy(InputIterator first, InputIterator last, + OutputIterator result, const T &old_value, + const T &new_value); + + +/*! \p replace_copy_if copies elements from the range [first, last) to the range + * [result, result + (last-first)), except that any element for which \p pred + * is \c true is not copied; \p new_value is copied instead. + * + * More precisely, for every integer \c n such that 0 <= n < last-first, + * \p replace_copy_if performs the assignment *(result+n) = new_value if + * pred(*(first+n)), and *(result+n) = *(first+n) otherwise. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the sequence to copy from. + * \param last The end of the sequence to copy from. + * \param result The beginning of the sequence to copy to. + * \param pred The predicate to test on every value of the range [first,last). + * \param new_value The replacement value to assign pred(*i) evaluates to \c true. + * \return result + (last-first) + * + * \tparam DerivedPolicy The name of the derived execution policy. 
+ * \tparam InputIterator is a model of Input Iterator, + * and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type. + * \tparam OutputIterator is a model of Output Iterator. + * \tparam Predicate is a model of Predicate. + * \tparam T is a model of Assignable, + * and \p T is convertible to \p OutputIterator's \c value_type. + * + * \pre \p first may equal \p result, but the ranges [first, last) and [result, result + (last - first)) shall not overlap otherwise. + * + * \code + * #include + * #include + * #include + * + * struct is_less_than_zero + * { + * __host__ __device__ + * bool operator()(int x) + * { + * return x < 0; + * } + * }; + * + * ... + * + * thrust::device_vector A(4); + * A[0] = 1; + * A[1] = -3; + * A[2] = 2; + * A[3] = -1; + + * thrust::device_vector B(4); + * is_less_than_zero pred; + * + * thrust::replace_copy_if(thrust::device, A.begin(), A.end(), B.begin(), pred, 0); + * + * // B contains [1, 0, 2, 0] + * \endcode + * + * \see http://www.sgi.com/tech/stl/replace_copy_if.html + * \see \c replace + * \see \c replace_if + * \see \c replace_copy + */ +template + OutputIterator replace_copy_if(const thrust::detail::execution_policy_base &exec, + InputIterator first, InputIterator last, + OutputIterator result, + Predicate pred, + const T &new_value); + + +/*! \p replace_copy_if copies elements from the range [first, last) to the range + * [result, result + (last-first)), except that any element for which \p pred + * is \c true is not copied; \p new_value is copied instead. + * + * More precisely, for every integer \c n such that 0 <= n < last-first, + * \p replace_copy_if performs the assignment *(result+n) = new_value if + * pred(*(first+n)), and *(result+n) = *(first+n) otherwise. + * + * \param first The beginning of the sequence to copy from. + * \param last The end of the sequence to copy from. + * \param result The beginning of the sequence to copy to. + * \param pred The predicate to test on every value of the range [first,last). + * \param new_value The replacement value to assign pred(*i) evaluates to \c true. + * \return result + (last-first) + * + * \tparam InputIterator is a model of Input Iterator, + * and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type. + * \tparam OutputIterator is a model of Output Iterator. + * \tparam Predicate is a model of Predicate. + * \tparam T is a model of Assignable, + * and \p T is convertible to \p OutputIterator's \c value_type. + * + * \pre \p first may equal \p result, but the ranges [first, last) and [result, result + (last - first)) shall not overlap otherwise. + * + * \code + * #include + * #include + * + * struct is_less_than_zero + * { + * __host__ __device__ + * bool operator()(int x) + * { + * return x < 0; + * } + * }; + * + * ... + * + * thrust::device_vector A(4); + * A[0] = 1; + * A[1] = -3; + * A[2] = 2; + * A[3] = -1; + + * thrust::device_vector B(4); + * is_less_than_zero pred; + * + * thrust::replace_copy_if(A.begin(), A.end(), B.begin(), pred, 0); + * + * // B contains [1, 0, 2, 0] + * \endcode + * + * \see http://www.sgi.com/tech/stl/replace_copy_if.html + * \see \c replace + * \see \c replace_if + * \see \c replace_copy + */ +template + OutputIterator replace_copy_if(InputIterator first, InputIterator last, + OutputIterator result, + Predicate pred, + const T &new_value); + + +/*! 
This version of \p replace_copy_if copies elements from the range [first, last) to the range
+ * [result, result + (last-first)), except that any element whose corresponding stencil
+ * element causes \p pred to be \c true is not copied; \p new_value is copied instead.
+ *
+ * More precisely, for every integer \c n such that 0 <= n < last-first,
+ * \p replace_copy_if performs the assignment *(result+n) = new_value if
+ * pred(*(stencil+n)), and *(result+n) = *(first+n) otherwise.
+ *
+ * The algorithm's execution is parallelized as determined by \p exec.
+ *
+ * \param exec The execution policy to use for parallelization.
+ * \param first The beginning of the sequence to copy from.
+ * \param last The end of the sequence to copy from.
+ * \param stencil The beginning of the stencil sequence.
+ * \param result The beginning of the sequence to copy to.
+ * \param pred The predicate to test on every value of the range [stencil, stencil + (last - first)).
+ * \param new_value The replacement value to assign when pred(*s) evaluates to \c true.
+ * \return result + (last-first)
+ *
+ * \tparam DerivedPolicy The name of the derived execution policy.
+ * \tparam InputIterator1 is a model of Input Iterator.
+ * \tparam InputIterator2 is a model of Input Iterator
+ * and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type.
+ * \tparam OutputIterator is a model of Output Iterator.
+ * \tparam Predicate is a model of Predicate.
+ * \tparam T is a model of Assignable,
+ * and \p T is convertible to \p OutputIterator's \c value_type.
+ *
+ * \pre \p first may equal \p result, but the ranges [first, last) and [result, result + (last - first)) shall not overlap otherwise.
+ * \pre \p stencil may equal \p result, but the ranges [stencil, stencil + (last - first)) and [result, result + (last - first)) shall not overlap otherwise.
+ *
+ * \code
+ * #include <thrust/replace.h>
+ * #include <thrust/device_vector.h>
+ * #include <thrust/execution_policy.h>
+ *
+ * struct is_less_than_zero
+ * {
+ *   __host__ __device__
+ *   bool operator()(int x)
+ *   {
+ *     return x < 0;
+ *   }
+ * };
+ *
+ * ...
+ *
+ * thrust::device_vector<int> A(4);
+ * A[0] = 10;
+ * A[1] = 20;
+ * A[2] = 30;
+ * A[3] = 40;
+ *
+ * thrust::device_vector<int> S(4);
+ * S[0] = -1;
+ * S[1] = 0;
+ * S[2] = -1;
+ * S[3] = 0;
+ *
+ * thrust::device_vector<int> B(4);
+ * is_less_than_zero pred;
+ *
+ * thrust::replace_copy_if(thrust::device, A.begin(), A.end(), S.begin(), B.begin(), pred, 0);
+ *
+ * // B contains [0, 20, 0, 40]
+ * \endcode
+ *
+ * \see \c replace_copy
+ * \see \c replace_if
+ */
+template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2, typename OutputIterator, typename Predicate, typename T>
+  OutputIterator replace_copy_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                 InputIterator1 first, InputIterator1 last,
+                                 InputIterator2 stencil,
+                                 OutputIterator result,
+                                 Predicate pred,
+                                 const T &new_value);
+
+
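As a brief editorial sketch of the stencil-driven overload declared above (not part of the original patch), the following program fills an output vector by consulting a stencil; is_negative is a hypothetical predicate introduced only for this illustration:

    // Illustrative sketch: stencil-driven replace_copy_if between device_vectors.
    #include <thrust/replace.h>
    #include <thrust/device_vector.h>
    #include <thrust/execution_policy.h>

    struct is_negative
    {
        __host__ __device__
        bool operator()(int x) const { return x < 0; }
    };

    int main(void)
    {
        int a[4] = {10, 20, 30, 40};
        int s[4] = {-1,  0, -1,  0};
        thrust::device_vector<int> A(a, a + 4);
        thrust::device_vector<int> S(s, s + 4);
        thrust::device_vector<int> B(4);

        // Where the stencil element is negative, write 0; otherwise copy the input.
        thrust::replace_copy_if(thrust::device,
                                A.begin(), A.end(), S.begin(), B.begin(),
                                is_negative(), 0);

        // B now holds {0, 20, 0, 40}; A is left unchanged.
        return 0;
    }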
+/*! This version of \p replace_copy_if copies elements from the range [first, last) to the range
+ * [result, result + (last-first)), except that any element whose corresponding stencil
+ * element causes \p pred to be \c true is not copied; \p new_value is copied instead.
+ *
+ * More precisely, for every integer \c n such that 0 <= n < last-first,
+ * \p replace_copy_if performs the assignment *(result+n) = new_value if
+ * pred(*(stencil+n)), and *(result+n) = *(first+n) otherwise.
+ *
+ * \param first The beginning of the sequence to copy from.
+ * \param last The end of the sequence to copy from.
+ * \param stencil The beginning of the stencil sequence.
+ * \param result The beginning of the sequence to copy to.
+ * \param pred The predicate to test on every value of the range [stencil, stencil + (last - first)).
+ * \param new_value The replacement value to assign when pred(*s) evaluates to \c true.
+ * \return result + (last-first)
+ *
+ * \tparam InputIterator1 is a model of Input Iterator.
+ * \tparam InputIterator2 is a model of Input Iterator
+ * and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type.
+ * \tparam OutputIterator is a model of Output Iterator.
+ * \tparam Predicate is a model of Predicate.
+ * \tparam T is a model of Assignable,
+ * and \p T is convertible to \p OutputIterator's \c value_type.
+ *
+ * \pre \p first may equal \p result, but the ranges [first, last) and [result, result + (last - first)) shall not overlap otherwise.
+ * \pre \p stencil may equal \p result, but the ranges [stencil, stencil + (last - first)) and [result, result + (last - first)) shall not overlap otherwise.
+ *
+ * \code
+ * #include <thrust/replace.h>
+ * #include <thrust/device_vector.h>
+ *
+ * struct is_less_than_zero
+ * {
+ *   __host__ __device__
+ *   bool operator()(int x)
+ *   {
+ *     return x < 0;
+ *   }
+ * };
+ *
+ * ...
+ *
+ * thrust::device_vector<int> A(4);
+ * A[0] = 10;
+ * A[1] = 20;
+ * A[2] = 30;
+ * A[3] = 40;
+ *
+ * thrust::device_vector<int> S(4);
+ * S[0] = -1;
+ * S[1] = 0;
+ * S[2] = -1;
+ * S[3] = 0;
+ *
+ * thrust::device_vector<int> B(4);
+ * is_less_than_zero pred;
+ *
+ * thrust::replace_copy_if(A.begin(), A.end(), S.begin(), B.begin(), pred, 0);
+ *
+ * // B contains [0, 20, 0, 40]
+ * \endcode
+ *
+ * \see \c replace_copy
+ * \see \c replace_if
+ */
+template<typename InputIterator1, typename InputIterator2, typename OutputIterator, typename Predicate, typename T>
+  OutputIterator replace_copy_if(InputIterator1 first, InputIterator1 last,
+                                 InputIterator2 stencil,
+                                 OutputIterator result,
+                                 Predicate pred,
+                                 const T &new_value);
+
+
+/*! \} // end replacing
+ * \} // transformations
+ */
+
+
+} // end thrust
+
+#include <thrust/detail/replace.inl>
+
diff --git a/compat/thrust/reverse.h b/compat/thrust/reverse.h
new file mode 100644
index 0000000..ba50c5d
--- /dev/null
+++ b/compat/thrust/reverse.h
@@ -0,0 +1,213 @@
+/*
+ * Copyright 2008-2012 NVIDIA Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/*! \file reverse.h
+ * \brief Reverses the order of a range
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+
+namespace thrust
+{
+
+
+/*! \addtogroup reordering
+ * \ingroup algorithms
+ */
+
+
+/*! \p reverse reverses a range. That is: for every i such that
+ * 0 <= i < (last - first) / 2, it exchanges *(first + i)
+ * and *(last - (i + 1)).
+ *
+ * The algorithm's execution is parallelized as determined by \p exec.
+ *
+ * \param exec The execution policy to use for parallelization.
+ * \param first The beginning of the range to reverse.
+ * \param last The end of the range to reverse.
+ *
+ * \tparam DerivedPolicy The name of the derived execution policy.
+ * \tparam BidirectionalIterator is a model of Bidirectional Iterator and
+ * \p BidirectionalIterator is mutable.
+ * + * The following code snippet demonstrates how to use \p reverse to reverse a + * \p device_vector of integers using the \p thrust::device execution policy for + * parallelization: + * + * \code + * #include + * #include + * ... + * const int N = 6; + * int data[N] = {0, 1, 2, 3, 4, 5}; + * thrust::device_vector v(data, data + N); + * thrust::reverse(thrust::device, v.begin(), v.end()); + * // v is now {5, 4, 3, 2, 1, 0} + * \endcode + * + * \see http://www.sgi.com/tech/stl/reverse.html + * \see \p reverse_copy + * \see \p reverse_iterator + */ +template + void reverse(const thrust::detail::execution_policy_base &exec, + BidirectionalIterator first, + BidirectionalIterator last); + + +/*! \p reverse reverses a range. That is: for every i such that + * 0 <= i <= (last - first) / 2, it exchanges *(first + i) + * and *(last - (i + 1)). + * + * \param first The beginning of the range to reverse. + * \param last The end of the range to reverse. + * + * \tparam BidirectionalIterator is a model of Bidirectional Iterator and + * \p BidirectionalIterator is mutable. + * + * The following code snippet demonstrates how to use \p reverse to reverse a + * \p device_vector of integers. + * + * \code + * #include + * ... + * const int N = 6; + * int data[N] = {0, 1, 2, 3, 4, 5}; + * thrust::device_vector v(data, data + N); + * thrust::reverse(v.begin(), v.end()); + * // v is now {5, 4, 3, 2, 1, 0} + * \endcode + * + * \see http://www.sgi.com/tech/stl/reverse.html + * \see \p reverse_copy + * \see \p reverse_iterator + */ +template + void reverse(BidirectionalIterator first, + BidirectionalIterator last); + + +/*! \p reverse_copy differs from \ref reverse only in that the reversed range + * is written to a different output range, rather than inplace. + * + * \p reverse_copy copies elements from the range [first, last) to the + * range [result, result + (last - first)) such that the copy is a + * reverse of the original range. Specifically: for every i such that + * 0 <= i < (last - first), \p reverse_copy performs the assignment + * *(result + (last - first) - i) = *(first + i). + * + * The return value is result + (last - first)). + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the range to reverse. + * \param last The end of the range to reverse. + * \param result The beginning of the output range. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam BidirectionalIterator is a model of Bidirectional Iterator, + * and \p BidirectionalIterator's \p value_type is convertible to \p OutputIterator's \p value_type. + * \tparam OutputIterator is a model of Output Iterator. + * + * \pre The range [first, last) and the range [result, result + (last - first)) shall not overlap. + * + * The following code snippet demonstrates how to use \p reverse_copy to reverse + * an input \p device_vector of integers to an output \p device_vector using the \p thrust::device + * execution policy for parallelization: + * + * \code + * #include + * #include + * ... 
+ * const int N = 6;
+ * int data[N] = {0, 1, 2, 3, 4, 5};
+ * thrust::device_vector<int> input(data, data + N);
+ * thrust::device_vector<int> output(N);
+ * thrust::reverse_copy(thrust::device, input.begin(), input.end(), output.begin());
+ * // input is still {0, 1, 2, 3, 4, 5}
+ * // output is now {5, 4, 3, 2, 1, 0}
+ * \endcode
+ *
+ * \see http://www.sgi.com/tech/stl/reverse_copy.html
+ * \see \p reverse
+ * \see \p reverse_iterator
+ */
+template<typename DerivedPolicy, typename BidirectionalIterator, typename OutputIterator>
+  OutputIterator reverse_copy(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                              BidirectionalIterator first,
+                              BidirectionalIterator last,
+                              OutputIterator result);
+
+
+/*! \p reverse_copy differs from \ref reverse only in that the reversed range
+ * is written to a different output range, rather than in place.
+ *
+ * \p reverse_copy copies elements from the range [first, last) to the
+ * range [result, result + (last - first)) such that the copy is a
+ * reverse of the original range. Specifically: for every i such that
+ * 0 <= i < (last - first), \p reverse_copy performs the assignment
+ * *(result + (last - first) - (i + 1)) = *(first + i).
+ *
+ * The return value is result + (last - first).
+ *
+ * \param first The beginning of the range to reverse.
+ * \param last The end of the range to reverse.
+ * \param result The beginning of the output range.
+ *
+ * \tparam BidirectionalIterator is a model of Bidirectional Iterator,
+ * and \p BidirectionalIterator's \p value_type is convertible to \p OutputIterator's \p value_type.
+ * \tparam OutputIterator is a model of Output Iterator.
+ *
+ * \pre The range [first, last) and the range [result, result + (last - first)) shall not overlap.
+ *
+ * The following code snippet demonstrates how to use \p reverse_copy to reverse
+ * an input \p device_vector of integers to an output \p device_vector.
+ *
+ * \code
+ * #include <thrust/reverse.h>
+ * ...
+ * const int N = 6;
+ * int data[N] = {0, 1, 2, 3, 4, 5};
+ * thrust::device_vector<int> input(data, data + N);
+ * thrust::device_vector<int> output(N);
+ * thrust::reverse_copy(input.begin(), input.end(), output.begin());
+ * // input is still {0, 1, 2, 3, 4, 5}
+ * // output is now {5, 4, 3, 2, 1, 0}
+ * \endcode
+ *
+ * \see http://www.sgi.com/tech/stl/reverse_copy.html
+ * \see \p reverse
+ * \see \p reverse_iterator
+ */
+template<typename BidirectionalIterator, typename OutputIterator>
+  OutputIterator reverse_copy(BidirectionalIterator first,
+                              BidirectionalIterator last,
+                              OutputIterator result);
+
+
+/*! \} // end reordering
+ */
+
+
+} // end thrust
+
+#include <thrust/detail/reverse.inl>
+
diff --git a/compat/thrust/scan.h b/compat/thrust/scan.h
new file mode 100644
index 0000000..95074e6
--- /dev/null
+++ b/compat/thrust/scan.h
@@ -0,0 +1,1552 @@
+/*
+ * Copyright 2008-2012 NVIDIA Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/*! \file scan.h
+ * \brief Functions for computing prefix sums
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+
+namespace thrust
+{
+
+
+/*! \addtogroup algorithms
+ */
+
+
+/*! \addtogroup prefixsums Prefix Sums
+ * \ingroup algorithms
+ * \{
+ */
+
+
+/*! \p inclusive_scan computes an inclusive prefix sum operation.
The + * term 'inclusive' means that each result includes the corresponding + * input operand in the partial sum. More precisely, *first is + * assigned to *result and the sum of *first and + * *(first + 1) is assigned to *(result + 1), and so on. + * This version of \p inclusive_scan assumes plus as the associative operator. + * When the input and output sequences are the same, the scan is performed + * in-place. + + * \p inclusive_scan is similar to \c std::partial_sum in the STL. The primary + * difference between the two functions is that \c std::partial_sum guarantees + * a serial summation order, while \p inclusive_scan requires associativity of + * the binary operation to parallelize the prefix sum. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the input sequence. + * \param last The end of the input sequence. + * \param result The beginning of the output sequence. + * \return The end of the output sequence. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator is a model of Input Iterator + * and \c InputIterator's \c value_type is convertible to + * \c OutputIterator's \c value_type. + * \tparam OutputIterator is a model of Output Iterator, + * and if \c x and \c y are objects of \c OutputIterator's + * \c value_type, then x + y is defined. If \c T is + * \c OutputIterator's \c value_type, then T(0) is + * defined. + * + * \pre \p first may equal \p result but the range [first, last) and the range [result, result + (last - first)) shall not overlap otherwise. + * + * The following code snippet demonstrates how to use \p inclusive_scan to compute an in-place + * prefix sum using the \p thrust::host execution policy for parallelization: + * + * \code + * #include + * #include + * ... + * + * int data[6] = {1, 0, 2, 2, 1, 3}; + * + * thrust::inclusive_scan(thrust::host, data, data + 6, data); // in-place scan + * + * // data is now {1, 1, 3, 5, 6, 9} + * \endcode + * + * \see http://www.sgi.com/tech/stl/partial_sum.html + * + */ +template + OutputIterator inclusive_scan(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + OutputIterator result); + + +/*! \p inclusive_scan computes an inclusive prefix sum operation. The + * term 'inclusive' means that each result includes the corresponding + * input operand in the partial sum. More precisely, *first is + * assigned to *result and the sum of *first and + * *(first + 1) is assigned to *(result + 1), and so on. + * This version of \p inclusive_scan assumes plus as the associative operator. + * When the input and output sequences are the same, the scan is performed + * in-place. + + * \p inclusive_scan is similar to \c std::partial_sum in the STL. The primary + * difference between the two functions is that \c std::partial_sum guarantees + * a serial summation order, while \p inclusive_scan requires associativity of + * the binary operation to parallelize the prefix sum. + * + * \param first The beginning of the input sequence. + * \param last The end of the input sequence. + * \param result The beginning of the output sequence. + * \return The end of the output sequence. + * + * \tparam InputIterator is a model of Input Iterator + * and \c InputIterator's \c value_type is convertible to + * \c OutputIterator's \c value_type. 
+ * \tparam OutputIterator is a model of Output Iterator, + * and if \c x and \c y are objects of \c OutputIterator's + * \c value_type, then x + y is defined. If \c T is + * \c OutputIterator's \c value_type, then T(0) is + * defined. + * + * \pre \p first may equal \p result but the range [first, last) and the range [result, result + (last - first)) shall not overlap otherwise. + * + * The following code snippet demonstrates how to use \p inclusive_scan + * + * \code + * #include + * + * int data[6] = {1, 0, 2, 2, 1, 3}; + * + * thrust::inclusive_scan(data, data + 6, data); // in-place scan + * + * // data is now {1, 1, 3, 5, 6, 9} + * \endcode + * + * \see http://www.sgi.com/tech/stl/partial_sum.html + * + */ +template + OutputIterator inclusive_scan(InputIterator first, + InputIterator last, + OutputIterator result); + + +/*! \p inclusive_scan computes an inclusive prefix sum operation. The + * term 'inclusive' means that each result includes the corresponding + * input operand in the partial sum. When the input and output sequences + * are the same, the scan is performed in-place. + * + * \p inclusive_scan is similar to \c std::partial_sum in the STL. The primary + * difference between the two functions is that \c std::partial_sum guarantees + * a serial summation order, while \p inclusive_scan requires associativity of + * the binary operation to parallelize the prefix sum. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the input sequence. + * \param last The end of the input sequence. + * \param result The beginning of the output sequence. + * \param binary_op The associatve operator used to 'sum' values. + * \return The end of the output sequence. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator is a model of Input Iterator + * and \c InputIterator's \c value_type is convertible to + * \c OutputIterator's \c value_type. + * \tparam OutputIterator is a model of Output Iterator + * and \c OutputIterator's \c value_type is convertible to + * both \c AssociativeOperator's \c first_argument_type and + * \c second_argument_type. + * \tparam AssociativeOperator is a model of Binary Function + * and \c AssociativeOperator's \c result_type is + * convertible to \c OutputIterator's \c value_type. + * + * \pre \p first may equal \p result but the range [first, last) and the range [result, result + (last - first)) shall not overlap otherwise. + * + * The following code snippet demonstrates how to use \p inclusive_scan to compute an in-place + * prefix sum using the \p thrust::host execution policy for parallelization: + * + * \code + * int data[10] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8}; + * + * thrust::maximum binary_op; + * + * thrust::inclusive_scan(thrust::host, data, data + 10, data, binary_op); // in-place scan + * + * // data is now {-5, 0, 2, 2, 2, 4, 4, 4, 4, 8} + * \endcode + * + * \see http://www.sgi.com/tech/stl/partial_sum.html + */ +template + OutputIterator inclusive_scan(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + AssociativeOperator binary_op); + + +/*! \p inclusive_scan computes an inclusive prefix sum operation. The + * term 'inclusive' means that each result includes the corresponding + * input operand in the partial sum. When the input and output sequences + * are the same, the scan is performed in-place. 
+ * + * \p inclusive_scan is similar to \c std::partial_sum in the STL. The primary + * difference between the two functions is that \c std::partial_sum guarantees + * a serial summation order, while \p inclusive_scan requires associativity of + * the binary operation to parallelize the prefix sum. + * + * \param first The beginning of the input sequence. + * \param last The end of the input sequence. + * \param result The beginning of the output sequence. + * \param binary_op The associatve operator used to 'sum' values. + * \return The end of the output sequence. + * + * \tparam InputIterator is a model of Input Iterator + * and \c InputIterator's \c value_type is convertible to + * \c OutputIterator's \c value_type. + * \tparam OutputIterator is a model of Output Iterator + * and \c OutputIterator's \c value_type is convertible to + * both \c AssociativeOperator's \c first_argument_type and + * \c second_argument_type. + * \tparam AssociativeOperator is a model of Binary Function + * and \c AssociativeOperator's \c result_type is + * convertible to \c OutputIterator's \c value_type. + * + * \pre \p first may equal \p result but the range [first, last) and the range [result, result + (last - first)) shall not overlap otherwise. + * + * The following code snippet demonstrates how to use \p inclusive_scan + * + * \code + * int data[10] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8}; + * + * thrust::maximum binary_op; + * + * thrust::inclusive_scan(data, data + 10, data, binary_op); // in-place scan + * + * // data is now {-5, 0, 2, 2, 2, 4, 4, 4, 4, 8} + * \endcode + * + * \see http://www.sgi.com/tech/stl/partial_sum.html + */ +template + OutputIterator inclusive_scan(InputIterator first, + InputIterator last, + OutputIterator result, + AssociativeOperator binary_op); + + +/*! \p exclusive_scan computes an exclusive prefix sum operation. The + * term 'exclusive' means that each result does not include the + * corresponding input operand in the partial sum. More precisely, + * 0 is assigned to *result and the sum of + * 0 and *first is assigned to *(result + 1), + * and so on. This version of \p exclusive_scan assumes plus as the + * associative operator and \c 0 as the initial value. When the input and + * output sequences are the same, the scan is performed in-place. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the input sequence. + * \param last The end of the input sequence. + * \param result The beginning of the output sequence. + * \return The end of the output sequence. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator is a model of Input Iterator + * and \c InputIterator's \c value_type is convertible to + * \c OutputIterator's \c value_type. + * \tparam OutputIterator is a model of Output Iterator, + * and if \c x and \c y are objects of \c OutputIterator's + * \c value_type, then x + y is defined. If \c T is + * \c OutputIterator's \c value_type, then T(0) is + * defined. + * + * \pre \p first may equal \p result but the range [first, last) and the range [result, result + (last - first)) shall not overlap otherwise. + * + * The following code snippet demonstrates how to use \p exclusive_scan to compute an in-place + * prefix sum using the \p thrust::host execution policy for parallelization: + * + * \code + * #include + * #include + * ... 
+ * + * int data[6] = {1, 0, 2, 2, 1, 3}; + * + * thrust::exclusive_scan(thrust::host, data, data + 6, data); // in-place scan + * + * // data is now {0, 1, 1, 3, 5, 6} + * \endcode + * + * \see http://www.sgi.com/tech/stl/partial_sum.html + */ +template + OutputIterator exclusive_scan(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + OutputIterator result); + + +/*! \p exclusive_scan computes an exclusive prefix sum operation. The + * term 'exclusive' means that each result does not include the + * corresponding input operand in the partial sum. More precisely, + * 0 is assigned to *result and the sum of + * 0 and *first is assigned to *(result + 1), + * and so on. This version of \p exclusive_scan assumes plus as the + * associative operator and \c 0 as the initial value. When the input and + * output sequences are the same, the scan is performed in-place. + * + * \param first The beginning of the input sequence. + * \param last The end of the input sequence. + * \param result The beginning of the output sequence. + * \return The end of the output sequence. + * + * \tparam InputIterator is a model of Input Iterator + * and \c InputIterator's \c value_type is convertible to + * \c OutputIterator's \c value_type. + * \tparam OutputIterator is a model of Output Iterator, + * and if \c x and \c y are objects of \c OutputIterator's + * \c value_type, then x + y is defined. If \c T is + * \c OutputIterator's \c value_type, then T(0) is + * defined. + * + * \pre \p first may equal \p result but the range [first, last) and the range [result, result + (last - first)) shall not overlap otherwise. + * + * The following code snippet demonstrates how to use \p exclusive_scan + * + * \code + * #include + * + * int data[6] = {1, 0, 2, 2, 1, 3}; + * + * thrust::exclusive_scan(data, data + 6, data); // in-place scan + * + * // data is now {0, 1, 1, 3, 5, 6} + * \endcode + * + * \see http://www.sgi.com/tech/stl/partial_sum.html + */ +template + OutputIterator exclusive_scan(InputIterator first, + InputIterator last, + OutputIterator result); + + +/*! \p exclusive_scan computes an exclusive prefix sum operation. The + * term 'exclusive' means that each result does not include the + * corresponding input operand in the partial sum. More precisely, + * \p init is assigned to *result and the sum of \p init and + * *first is assigned to *(result + 1), and so on. + * This version of \p exclusive_scan assumes plus as the associative + * operator but requires an initial value \p init. When the input and + * output sequences are the same, the scan is performed in-place. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the input sequence. + * \param last The end of the input sequence. + * \param result The beginning of the output sequence. + * \param init The initial value. + * \return The end of the output sequence. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator is a model of Input Iterator + * and \c InputIterator's \c value_type is convertible to + * \c OutputIterator's \c value_type. + * \tparam OutputIterator is a model of Output Iterator, + * and if \c x and \c y are objects of \c OutputIterator's + * \c value_type, then x + y is defined. + * \tparam T is convertible to \c OutputIterator's \c value_type. 
+ * + * \pre \p first may equal \p result but the range [first, last) and the range [result, result + (last - first)) shall not overlap otherwise. + * + * The following code snippet demonstrates how to use \p exclusive_scan to compute an in-place + * prefix sum using the \p thrust::host execution policy for parallelization: + * + * \code + * #include + * #include + * + * int data[6] = {1, 0, 2, 2, 1, 3}; + * + * thrust::exclusive_scan(thrust::host, data, data + 6, data, 4); // in-place scan + * + * // data is now {4, 5, 5, 7, 9, 10} + * \endcode + * + * \see http://www.sgi.com/tech/stl/partial_sum.html + */ +template + OutputIterator exclusive_scan(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + T init); + + +/*! \p exclusive_scan computes an exclusive prefix sum operation. The + * term 'exclusive' means that each result does not include the + * corresponding input operand in the partial sum. More precisely, + * \p init is assigned to *result and the sum of \p init and + * *first is assigned to *(result + 1), and so on. + * This version of \p exclusive_scan assumes plus as the associative + * operator but requires an initial value \p init. When the input and + * output sequences are the same, the scan is performed in-place. + * + * \param first The beginning of the input sequence. + * \param last The end of the input sequence. + * \param result The beginning of the output sequence. + * \param init The initial value. + * \return The end of the output sequence. + * + * \tparam InputIterator is a model of Input Iterator + * and \c InputIterator's \c value_type is convertible to + * \c OutputIterator's \c value_type. + * \tparam OutputIterator is a model of Output Iterator, + * and if \c x and \c y are objects of \c OutputIterator's + * \c value_type, then x + y is defined. + * \tparam T is convertible to \c OutputIterator's \c value_type. + * + * \pre \p first may equal \p result but the range [first, last) and the range [result, result + (last - first)) shall not overlap otherwise. + * + * The following code snippet demonstrates how to use \p exclusive_scan + * + * \code + * #include + * + * int data[6] = {1, 0, 2, 2, 1, 3}; + * + * thrust::exclusive_scan(data, data + 6, data, 4); // in-place scan + * + * // data is now {4, 5, 5, 7, 9, 10} + * \endcode + * + * \see http://www.sgi.com/tech/stl/partial_sum.html + */ +template + OutputIterator exclusive_scan(InputIterator first, + InputIterator last, + OutputIterator result, + T init); + + +/*! \p exclusive_scan computes an exclusive prefix sum operation. The + * term 'exclusive' means that each result does not include the + * corresponding input operand in the partial sum. More precisely, + * \p init is assigned to \*result and the value + * binary_op(init, \*first) is assigned to \*(result + 1), + * and so on. This version of the function requires both and associative + * operator and an initial value \p init. When the input and output + * sequences are the same, the scan is performed in-place. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the input sequence. + * \param last The end of the input sequence. + * \param result The beginning of the output sequence. + * \param init The initial value. + * \param binary_op The associatve operator used to 'sum' values. + * \return The end of the output sequence. 
+ * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator is a model of Input Iterator + * and \c InputIterator's \c value_type is convertible to + * \c OutputIterator's \c value_type. + * \tparam OutputIterator is a model of Output Iterator + * and \c OutputIterator's \c value_type is convertible to + * both \c AssociativeOperator's \c first_argument_type and + * \c second_argument_type. + * \tparam T is convertible to \c OutputIterator's \c value_type. + * \tparam AssociativeOperator is a model of Binary Function + * and \c AssociativeOperator's \c result_type is + * convertible to \c OutputIterator's \c value_type. + * + * \pre \p first may equal \p result but the range [first, last) and the range [result, result + (last - first)) shall not overlap otherwise. + * + * The following code snippet demonstrates how to use \p exclusive_scan to compute an in-place + * prefix sum using the \p thrust::host execution policy for parallelization: + * + * \code + * #include + * #include + * #include + * ... + * + * int data[10] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8}; + * + * thrust::maximum binary_op; + * + * thrust::exclusive_scan(thrust::host, data, data + 10, data, 1, binary_op); // in-place scan + * + * // data is now {1, 1, 1, 2, 2, 2, 4, 4, 4, 4 } + * \endcode + * + * \see http://www.sgi.com/tech/stl/partial_sum.html + */ +template + OutputIterator exclusive_scan(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + T init, + AssociativeOperator binary_op); + + +/*! \p exclusive_scan computes an exclusive prefix sum operation. The + * term 'exclusive' means that each result does not include the + * corresponding input operand in the partial sum. More precisely, + * \p init is assigned to \*result and the value + * binary_op(init, \*first) is assigned to \*(result + 1), + * and so on. This version of the function requires both and associative + * operator and an initial value \p init. When the input and output + * sequences are the same, the scan is performed in-place. + * + * \param first The beginning of the input sequence. + * \param last The end of the input sequence. + * \param result The beginning of the output sequence. + * \param init The initial value. + * \param binary_op The associatve operator used to 'sum' values. + * \return The end of the output sequence. + * + * \tparam InputIterator is a model of Input Iterator + * and \c InputIterator's \c value_type is convertible to + * \c OutputIterator's \c value_type. + * \tparam OutputIterator is a model of Output Iterator + * and \c OutputIterator's \c value_type is convertible to + * both \c AssociativeOperator's \c first_argument_type and + * \c second_argument_type. + * \tparam T is convertible to \c OutputIterator's \c value_type. + * \tparam AssociativeOperator is a model of Binary Function + * and \c AssociativeOperator's \c result_type is + * convertible to \c OutputIterator's \c value_type. + * + * \pre \p first may equal \p result but the range [first, last) and the range [result, result + (last - first)) shall not overlap otherwise. 
+ * + * The following code snippet demonstrates how to use \p exclusive_scan + * + * \code + * #include + * #include + * + * int data[10] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8}; + * + * thrust::maximum binary_op; + * + * thrust::exclusive_scan(data, data + 10, data, 1, binary_op); // in-place scan + * + * // data is now {1, 1, 1, 2, 2, 2, 4, 4, 4, 4 } + * \endcode + * + * \see http://www.sgi.com/tech/stl/partial_sum.html + */ +template + OutputIterator exclusive_scan(InputIterator first, + InputIterator last, + OutputIterator result, + T init, + AssociativeOperator binary_op); + + +/*! \addtogroup segmentedprefixsums Segmented Prefix Sums + * \ingroup prefixsums + * \{ + */ + + +/*! \p inclusive_scan_by_key computes an inclusive key-value or 'segmented' prefix + * sum operation. The term 'inclusive' means that each result includes + * the corresponding input operand in the partial sum. The term 'segmented' + * means that the partial sums are broken into distinct segments. In other + * words, within each segment a separate inclusive scan operation is computed. + * Refer to the code sample below for example usage. + * + * This version of \p inclusive_scan_by_key assumes \c equal_to as the binary + * predicate used to compare adjacent keys. Specifically, consecutive iterators + * i and i+1 in the range [first1, last1) + * belong to the same segment if *i == *(i+1), and belong to + * different segments otherwise. + * + * This version of \p inclusive_scan_by_key assumes \c plus as the associative + * operator used to perform the prefix sum. When the input and output sequences + * are the same, the scan is performed in-place. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first1 The beginning of the key sequence. + * \param last1 The end of the key sequence. + * \param first2 The beginning of the input value sequence. + * \param result The beginning of the output value sequence. + * \return The end of the output sequence. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator1 is a model of Input Iterator + * \tparam InputIterator2 is a model of Input Iterator + * and \c InputIterator2's \c value_type is convertible to \c OutputIterator's \c value_type. + * \tparam OutputIterator is a model of Output Iterator, + * and if \c x and \c y are objects of \c OutputIterator's \c value_type, then + * binary_op(x,y) is defined. + * + * \pre \p first1 may equal \p result but the range [first1, last1) and the range [result, result + (last1 - first1)) shall not overlap otherwise. + * \pre \p first2 may equal \p result but the range [first2, first2 + (last1 - first1) and range [result, result + (last1 - first1)) shall not overlap otherwise. + * + * The following code snippet demonstrates how to use \p inclusive_scan_by_key using the \p thrust::host + * execution policy for parallelization: + * + * \code + * #include + * #include + * ... 
+ *
+ * int data[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+ * int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3};
+ *
+ * thrust::inclusive_scan_by_key(thrust::host, keys, keys + 10, data, data); // in-place scan
+ *
+ * // data is now {1, 2, 3, 1, 2, 1, 1, 2, 3, 4};
+ * \endcode
+ *
+ * \see inclusive_scan
+ * \see exclusive_scan_by_key
+ *
+ */
+template
+  OutputIterator inclusive_scan_by_key(const thrust::detail::execution_policy_base &exec,
+                                       InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result);
+
+
+/*! \p inclusive_scan_by_key computes an inclusive key-value or 'segmented' prefix
+ * sum operation. The term 'inclusive' means that each result includes
+ * the corresponding input operand in the partial sum. The term 'segmented'
+ * means that the partial sums are broken into distinct segments. In other
+ * words, within each segment a separate inclusive scan operation is computed.
+ * Refer to the code sample below for example usage.
+ *
+ * This version of \p inclusive_scan_by_key assumes \c equal_to as the binary
+ * predicate used to compare adjacent keys. Specifically, consecutive iterators
+ * i and i+1 in the range [first1, last1)
+ * belong to the same segment if *i == *(i+1), and belong to
+ * different segments otherwise.
+ *
+ * This version of \p inclusive_scan_by_key assumes \c plus as the associative
+ * operator used to perform the prefix sum. When the input and output sequences
+ * are the same, the scan is performed in-place.
+ *
+ * \param first1 The beginning of the key sequence.
+ * \param last1 The end of the key sequence.
+ * \param first2 The beginning of the input value sequence.
+ * \param result The beginning of the output value sequence.
+ * \return The end of the output sequence.
+ *
+ * \tparam InputIterator1 is a model of Input Iterator
+ * \tparam InputIterator2 is a model of Input Iterator
+ * and \c InputIterator2's \c value_type is convertible to \c OutputIterator's \c value_type.
+ * \tparam OutputIterator is a model of Output Iterator,
+ * and if \c x and \c y are objects of \c OutputIterator's \c value_type, then
+ * binary_op(x,y) is defined.
+ *
+ * \pre \p first1 may equal \p result but the range [first1, last1) and the range [result, result + (last1 - first1)) shall not overlap otherwise.
+ * \pre \p first2 may equal \p result but the range [first2, first2 + (last1 - first1)) and range [result, result + (last1 - first1)) shall not overlap otherwise.
+ *
+ * The following code snippet demonstrates how to use \p inclusive_scan_by_key
+ *
+ * \code
+ * #include
+ *
+ * int data[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+ * int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3};
+ *
+ * thrust::inclusive_scan_by_key(keys, keys + 10, data, data); // in-place scan
+ *
+ * // data is now {1, 2, 3, 1, 2, 1, 1, 2, 3, 4};
+ * \endcode
+ *
+ * \see inclusive_scan
+ * \see exclusive_scan_by_key
+ *
+ */
+template
+  OutputIterator inclusive_scan_by_key(InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result);
+
+
+/*! \p inclusive_scan_by_key computes an inclusive key-value or 'segmented' prefix
+ * sum operation. The term 'inclusive' means that each result includes
+ * the corresponding input operand in the partial sum. The term 'segmented'
+ * means that the partial sums are broken into distinct segments. In other
+ * words, within each segment a separate inclusive scan operation is computed.
+ * Refer to the code sample below for example usage.
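+ *
+ * In addition to the raw-array sample further below, the same overload can be driven with
+ * device containers; the following is only an illustrative sketch (it assumes
+ * \c thrust::device_vector and the \p thrust::device policy are available):
+ *
+ * \code
+ * #include <thrust/scan.h>
+ * #include <thrust/functional.h>
+ * #include <thrust/device_vector.h>
+ * #include <thrust/execution_policy.h>
+ * ...
+ * int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3};
+ * int data[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+ *
+ * thrust::device_vector<int> d_keys(keys, keys + 10);
+ * thrust::device_vector<int> d_data(data, data + 10);
+ *
+ * // scan the values within each segment of equal keys, in-place
+ * thrust::inclusive_scan_by_key(thrust::device,
+ *                               d_keys.begin(), d_keys.end(),
+ *                               d_data.begin(), d_data.begin(),
+ *                               thrust::equal_to<int>());
+ *
+ * // d_data is now {1, 2, 3, 1, 2, 1, 1, 2, 3, 4}
+ * \endcode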
+ *
+ * This version of \p inclusive_scan_by_key uses the binary predicate
+ * \c binary_pred to compare adjacent keys. Specifically, consecutive iterators
+ * i and i+1 in the range [first1, last1)
+ * belong to the same segment if binary_pred(*i, *(i+1)) is true, and belong to
+ * different segments otherwise.
+ *
+ * This version of \p inclusive_scan_by_key assumes \c plus as the associative
+ * operator used to perform the prefix sum. When the input and output sequences
+ * are the same, the scan is performed in-place.
+ *
+ * The algorithm's execution is parallelized as determined by \p exec.
+ *
+ * \param exec The execution policy to use for parallelization.
+ * \param first1 The beginning of the key sequence.
+ * \param last1 The end of the key sequence.
+ * \param first2 The beginning of the input value sequence.
+ * \param result The beginning of the output value sequence.
+ * \param binary_pred The binary predicate used to determine equality of keys.
+ * \return The end of the output sequence.
+ *
+ * \tparam DerivedPolicy The name of the derived execution policy.
+ * \tparam InputIterator1 is a model of Input Iterator
+ * \tparam InputIterator2 is a model of Input Iterator
+ * and \c InputIterator2's \c value_type is convertible to \c OutputIterator's \c value_type.
+ * \tparam OutputIterator is a model of Output Iterator,
+ * and if \c x and \c y are objects of \c OutputIterator's \c value_type, then
+ * binary_op(x,y) is defined.
+ * \tparam BinaryPredicate is a model of Binary Predicate.
+ *
+ * \pre \p first1 may equal \p result but the range [first1, last1) and the range [result, result + (last1 - first1)) shall not overlap otherwise.
+ * \pre \p first2 may equal \p result but the range [first2, first2 + (last1 - first1)) and range [result, result + (last1 - first1)) shall not overlap otherwise.
+ *
+ * The following code snippet demonstrates how to use \p inclusive_scan_by_key using the \p thrust::host
+ * execution policy for parallelization:
+ *
+ * \code
+ * #include
+ * #include
+ * #include
+ * ...
+ *
+ * int data[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+ * int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3};
+ *
+ * thrust::equal_to binary_pred;
+ *
+ * thrust::inclusive_scan_by_key(thrust::host, keys, keys + 10, data, data, binary_pred); // in-place scan
+ *
+ * // data is now {1, 2, 3, 1, 2, 1, 1, 2, 3, 4};
+ * \endcode
+ *
+ * \see inclusive_scan
+ * \see exclusive_scan_by_key
+ *
+ */
+template
+  OutputIterator inclusive_scan_by_key(const thrust::detail::execution_policy_base &exec,
+                                       InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result,
+                                       BinaryPredicate binary_pred);
+
+
+/*! \p inclusive_scan_by_key computes an inclusive key-value or 'segmented' prefix
+ * sum operation. The term 'inclusive' means that each result includes
+ * the corresponding input operand in the partial sum. The term 'segmented'
+ * means that the partial sums are broken into distinct segments. In other
+ * words, within each segment a separate inclusive scan operation is computed.
+ * Refer to the code sample below for example usage.
+ *
+ * This version of \p inclusive_scan_by_key uses the binary predicate
+ * \c binary_pred to compare adjacent keys. Specifically, consecutive iterators
+ * i and i+1 in the range [first1, last1)
+ * belong to the same segment if binary_pred(*i, *(i+1)) is true, and belong to
+ * different segments otherwise.
+ *
+ * This version of \p inclusive_scan_by_key assumes \c plus as the associative
+ * operator used to perform the prefix sum.
+ * When the input and output sequences
+ * are the same, the scan is performed in-place.
+ *
+ * \param first1 The beginning of the key sequence.
+ * \param last1 The end of the key sequence.
+ * \param first2 The beginning of the input value sequence.
+ * \param result The beginning of the output value sequence.
+ * \param binary_pred The binary predicate used to determine equality of keys.
+ * \return The end of the output sequence.
+ *
+ * \tparam InputIterator1 is a model of Input Iterator
+ * \tparam InputIterator2 is a model of Input Iterator
+ * and \c InputIterator2's \c value_type is convertible to \c OutputIterator's \c value_type.
+ * \tparam OutputIterator is a model of Output Iterator,
+ * and if \c x and \c y are objects of \c OutputIterator's \c value_type, then
+ * binary_op(x,y) is defined.
+ * \tparam BinaryPredicate is a model of Binary Predicate.
+ *
+ * \pre \p first1 may equal \p result but the range [first1, last1) and the range [result, result + (last1 - first1)) shall not overlap otherwise.
+ * \pre \p first2 may equal \p result but the range [first2, first2 + (last1 - first1)) and range [result, result + (last1 - first1)) shall not overlap otherwise.
+ *
+ * The following code snippet demonstrates how to use \p inclusive_scan_by_key
+ *
+ * \code
+ * #include
+ * #include
+ *
+ * int data[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+ * int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3};
+ *
+ * thrust::equal_to binary_pred;
+ *
+ * thrust::inclusive_scan_by_key(keys, keys + 10, data, data, binary_pred); // in-place scan
+ *
+ * // data is now {1, 2, 3, 1, 2, 1, 1, 2, 3, 4};
+ * \endcode
+ *
+ * \see inclusive_scan
+ * \see exclusive_scan_by_key
+ *
+ */
+template
+  OutputIterator inclusive_scan_by_key(InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result,
+                                       BinaryPredicate binary_pred);
+
+
+/*! \p inclusive_scan_by_key computes an inclusive key-value or 'segmented' prefix
+ * sum operation. The term 'inclusive' means that each result includes
+ * the corresponding input operand in the partial sum. The term 'segmented'
+ * means that the partial sums are broken into distinct segments. In other
+ * words, within each segment a separate inclusive scan operation is computed.
+ * Refer to the code sample below for example usage.
+ *
+ * This version of \p inclusive_scan_by_key uses the binary predicate
+ * \c binary_pred to compare adjacent keys. Specifically, consecutive iterators
+ * i and i+1 in the range [first1, last1)
+ * belong to the same segment if binary_pred(*i, *(i+1)) is true, and belong to
+ * different segments otherwise.
+ *
+ * This version of \p inclusive_scan_by_key uses the associative operator
+ * \c binary_op to perform the prefix sum. When the input and output sequences
+ * are the same, the scan is performed in-place.
+ *
+ * The algorithm's execution is parallelized as determined by \p exec.
+ *
+ * \param exec The execution policy to use for parallelization.
+ * \param first1 The beginning of the key sequence.
+ * \param last1 The end of the key sequence.
+ * \param first2 The beginning of the input value sequence.
+ * \param result The beginning of the output value sequence.
+ * \param binary_pred The binary predicate used to determine equality of keys.
+ * \param binary_op The associative operator used to 'sum' values.
+ * \return The end of the output sequence.
+ *
+ * \tparam DerivedPolicy The name of the derived execution policy.
+ * \tparam InputIterator1 is a model of Input Iterator
+ * \tparam InputIterator2 is a model of Input Iterator
+ * and \c InputIterator2's \c value_type is convertible to \c OutputIterator's \c value_type.
+ * \tparam OutputIterator is a model of Output Iterator,
+ * and if \c x and \c y are objects of \c OutputIterator's \c value_type, then
+ * binary_op(x,y) is defined.
+ * \tparam BinaryPredicate is a model of Binary Predicate.
+ * \tparam AssociativeOperator is a model of Binary Function
+ * and \c AssociativeOperator's \c result_type is
+ * convertible to \c OutputIterator's \c value_type.
+ *
+ * \pre \p first1 may equal \p result but the range [first1, last1) and the range [result, result + (last1 - first1)) shall not overlap otherwise.
+ * \pre \p first2 may equal \p result but the range [first2, first2 + (last1 - first1)) and range [result, result + (last1 - first1)) shall not overlap otherwise.
+ *
+ * The following code snippet demonstrates how to use \p inclusive_scan_by_key using the \p thrust::host
+ * execution policy for parallelization:
+ *
+ * \code
+ * #include
+ * #include
+ * #include
+ * ...
+ *
+ * int data[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+ * int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3};
+ *
+ * thrust::equal_to binary_pred;
+ * thrust::plus binary_op;
+ *
+ * thrust::inclusive_scan_by_key(thrust::host, keys, keys + 10, data, data, binary_pred, binary_op); // in-place scan
+ *
+ * // data is now {1, 2, 3, 1, 2, 1, 1, 2, 3, 4};
+ * \endcode
+ *
+ * \see inclusive_scan
+ * \see exclusive_scan_by_key
+ *
+ */
+template
+  OutputIterator inclusive_scan_by_key(const thrust::detail::execution_policy_base &exec,
+                                       InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result,
+                                       BinaryPredicate binary_pred,
+                                       AssociativeOperator binary_op);
+
+
+/*! \p inclusive_scan_by_key computes an inclusive key-value or 'segmented' prefix
+ * sum operation. The term 'inclusive' means that each result includes
+ * the corresponding input operand in the partial sum. The term 'segmented'
+ * means that the partial sums are broken into distinct segments. In other
+ * words, within each segment a separate inclusive scan operation is computed.
+ * Refer to the code sample below for example usage.
+ *
+ * This version of \p inclusive_scan_by_key uses the binary predicate
+ * \c binary_pred to compare adjacent keys. Specifically, consecutive iterators
+ * i and i+1 in the range [first1, last1)
+ * belong to the same segment if binary_pred(*i, *(i+1)) is true, and belong to
+ * different segments otherwise.
+ *
+ * This version of \p inclusive_scan_by_key uses the associative operator
+ * \c binary_op to perform the prefix sum. When the input and output sequences
+ * are the same, the scan is performed in-place.
+ *
+ * \param first1 The beginning of the key sequence.
+ * \param last1 The end of the key sequence.
+ * \param first2 The beginning of the input value sequence.
+ * \param result The beginning of the output value sequence.
+ * \param binary_pred The binary predicate used to determine equality of keys.
+ * \param binary_op The associative operator used to 'sum' values.
+ * \return The end of the output sequence.
+ *
+ * \tparam InputIterator1 is a model of Input Iterator
+ * \tparam InputIterator2 is a model of Input Iterator
+ * and \c InputIterator2's \c value_type is convertible to \c OutputIterator's \c value_type.
+ * \tparam OutputIterator is a model of Output Iterator,
+ * and if \c x and \c y are objects of \c OutputIterator's \c value_type, then
+ * binary_op(x,y) is defined.
+ * \tparam BinaryPredicate is a model of Binary Predicate.
+ * \tparam AssociativeOperator is a model of Binary Function
+ * and \c AssociativeOperator's \c result_type is
+ * convertible to \c OutputIterator's \c value_type.
+ *
+ * \pre \p first1 may equal \p result but the range [first1, last1) and the range [result, result + (last1 - first1)) shall not overlap otherwise.
+ * \pre \p first2 may equal \p result but the range [first2, first2 + (last1 - first1)) and range [result, result + (last1 - first1)) shall not overlap otherwise.
+ *
+ * The following code snippet demonstrates how to use \p inclusive_scan_by_key
+ *
+ * \code
+ * #include
+ * #include
+ *
+ * int data[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+ * int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3};
+ *
+ * thrust::equal_to binary_pred;
+ * thrust::plus binary_op;
+ *
+ * thrust::inclusive_scan_by_key(keys, keys + 10, data, data, binary_pred, binary_op); // in-place scan
+ *
+ * // data is now {1, 2, 3, 1, 2, 1, 1, 2, 3, 4};
+ * \endcode
+ *
+ * \see inclusive_scan
+ * \see exclusive_scan_by_key
+ *
+ */
+template
+  OutputIterator inclusive_scan_by_key(InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result,
+                                       BinaryPredicate binary_pred,
+                                       AssociativeOperator binary_op);
+
+
+/*! \p exclusive_scan_by_key computes an exclusive segmented prefix sum operation.
+ *
+ * This version of \p exclusive_scan_by_key uses the value \c 0 to
+ * initialize the exclusive scan operation.
+ *
+ * This version of \p exclusive_scan_by_key assumes \c plus as the associative
+ * operator used to perform the prefix sum. When the input and output sequences
+ * are the same, the scan is performed in-place.
+ *
+ * This version of \p exclusive_scan_by_key assumes \c equal_to as the binary
+ * predicate used to compare adjacent keys. Specifically, consecutive iterators
+ * i and i+1 in the range [first1, last1)
+ * belong to the same segment if *i == *(i+1), and belong to
+ * different segments otherwise.
+ *
+ * Refer to the most general form of \p exclusive_scan_by_key for additional details.
+ *
+ * The algorithm's execution is parallelized as determined by \p exec.
+ *
+ * \param exec The execution policy to use for parallelization.
+ * \param first1 The beginning of the key sequence.
+ * \param last1 The end of the key sequence.
+ * \param first2 The beginning of the input value sequence.
+ * \param result The beginning of the output value sequence.
+ *
+ * \pre \p first1 may equal \p result but the range [first1, last1) and the range [result, result + (last1 - first1)) shall not overlap otherwise.
+ * \pre \p first2 may equal \p result but the range [first2, first2 + (last1 - first1)) and range [result, result + (last1 - first1)) shall not overlap otherwise.
+ *
+ * The following code snippet demonstrates how to use \p exclusive_scan_by_key using the
+ * \p thrust::host execution policy for parallelization:
+ *
+ * \code
+ * #include
+ * #include
+ * ...
+ *
+ * int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3};
+ * int vals[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+ *
+ * thrust::exclusive_scan_by_key(thrust::host, keys, keys + 10, vals, vals); // in-place scan
+ *
+ * // vals is now {0, 1, 2, 0, 1, 0, 0, 1, 2, 3};
+ * \endcode
+ *
+ * \see exclusive_scan
+ *
+ */
+template
+  OutputIterator exclusive_scan_by_key(const thrust::detail::execution_policy_base &exec,
+                                       InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result);
+
+
+/*! \p exclusive_scan_by_key computes an exclusive segmented prefix sum operation.
+ *
+ * This version of \p exclusive_scan_by_key uses the value \c 0 to
+ * initialize the exclusive scan operation.
+ *
+ * This version of \p exclusive_scan_by_key assumes \c plus as the associative
+ * operator used to perform the prefix sum. When the input and output sequences
+ * are the same, the scan is performed in-place.
+ *
+ * This version of \p exclusive_scan_by_key assumes \c equal_to as the binary
+ * predicate used to compare adjacent keys. Specifically, consecutive iterators
+ * i and i+1 in the range [first1, last1)
+ * belong to the same segment if *i == *(i+1), and belong to
+ * different segments otherwise.
+ *
+ * Refer to the most general form of \p exclusive_scan_by_key for additional details.
+ *
+ * \param first1 The beginning of the key sequence.
+ * \param last1 The end of the key sequence.
+ * \param first2 The beginning of the input value sequence.
+ * \param result The beginning of the output value sequence.
+ *
+ * \pre \p first1 may equal \p result but the range [first1, last1) and the range [result, result + (last1 - first1)) shall not overlap otherwise.
+ * \pre \p first2 may equal \p result but the range [first2, first2 + (last1 - first1)) and range [result, result + (last1 - first1)) shall not overlap otherwise.
+ *
+ * The following code snippet demonstrates how to use \p exclusive_scan_by_key.
+ *
+ * \code
+ * #include
+ *
+ * int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3};
+ * int vals[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+ *
+ * thrust::exclusive_scan_by_key(keys, keys + 10, vals, vals); // in-place scan
+ *
+ * // vals is now {0, 1, 2, 0, 1, 0, 0, 1, 2, 3};
+ * \endcode
+ *
+ * \see exclusive_scan
+ *
+ */
+template
+  OutputIterator exclusive_scan_by_key(InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result);
+
+
+/*! \p exclusive_scan_by_key computes an exclusive key-value or 'segmented' prefix
+ * sum operation. The term 'exclusive' means that each result does not include
+ * the corresponding input operand in the partial sum. The term 'segmented'
+ * means that the partial sums are broken into distinct segments. In other
+ * words, within each segment a separate exclusive scan operation is computed.
+ * Refer to the code sample below for example usage.
+ *
+ * This version of \p exclusive_scan_by_key uses the value \c init to
+ * initialize the exclusive scan operation.
+ *
+ * The algorithm's execution is parallelized as determined by \p exec.
+ *
+ * \param exec The execution policy to use for parallelization.
+ * \param first1 The beginning of the key sequence.
+ * \param last1 The end of the key sequence.
+ * \param first2 The beginning of the input value sequence.
+ * \param result The beginning of the output value sequence.
+ * \param init The initial value of the exclusive sum.
+ * \return The end of the output sequence.
+ *
+ * \pre \p first1 may equal \p result but the range [first1, last1) and the range [result, result + (last1 - first1)) shall not overlap otherwise.
+ * \pre \p first2 may equal \p result but the range [first2, first2 + (last1 - first1)) and range [result, result + (last1 - first1)) shall not overlap otherwise.
+ *
+ * The following code snippet demonstrates how to use \p exclusive_scan_by_key using the \p
+ * thrust::host execution policy for parallelization:
+ *
+ * \code
+ * #include
+ * #include
+ * #include
+ * ...
+ *
+ * int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3};
+ * int vals[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+ *
+ * int init = 5;
+ *
+ * thrust::exclusive_scan_by_key(thrust::host, keys, keys + 10, vals, vals, init); // in-place scan
+ *
+ * // vals is now {5, 6, 7, 5, 6, 5, 5, 6, 7, 8};
+ * \endcode
+ *
+ * \see exclusive_scan
+ * \see inclusive_scan_by_key
+ *
+ */
+template
+  OutputIterator exclusive_scan_by_key(const thrust::detail::execution_policy_base &exec,
+                                       InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result,
+                                       T init);
+
+
+/*! \p exclusive_scan_by_key computes an exclusive key-value or 'segmented' prefix
+ * sum operation. The term 'exclusive' means that each result does not include
+ * the corresponding input operand in the partial sum. The term 'segmented'
+ * means that the partial sums are broken into distinct segments. In other
+ * words, within each segment a separate exclusive scan operation is computed.
+ * Refer to the code sample below for example usage.
+ *
+ * This version of \p exclusive_scan_by_key uses the value \c init to
+ * initialize the exclusive scan operation.
+ *
+ * \param first1 The beginning of the key sequence.
+ * \param last1 The end of the key sequence.
+ * \param first2 The beginning of the input value sequence.
+ * \param result The beginning of the output value sequence.
+ * \param init The initial value of the exclusive sum.
+ * \return The end of the output sequence.
+ *
+ * \pre \p first1 may equal \p result but the range [first1, last1) and the range [result, result + (last1 - first1)) shall not overlap otherwise.
+ * \pre \p first2 may equal \p result but the range [first2, first2 + (last1 - first1)) and range [result, result + (last1 - first1)) shall not overlap otherwise.
+ *
+ * The following code snippet demonstrates how to use \p exclusive_scan_by_key
+ *
+ * \code
+ * #include
+ * #include
+ *
+ * int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3};
+ * int vals[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+ *
+ * int init = 5;
+ *
+ * thrust::exclusive_scan_by_key(keys, keys + 10, vals, vals, init); // in-place scan
+ *
+ * // vals is now {5, 6, 7, 5, 6, 5, 5, 6, 7, 8};
+ * \endcode
+ *
+ * \see exclusive_scan
+ * \see inclusive_scan_by_key
+ *
+ */
+template
+  OutputIterator exclusive_scan_by_key(InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result,
+                                       T init);
+
+
+/*! \p exclusive_scan_by_key computes an exclusive key-value or 'segmented' prefix
+ * sum operation. The term 'exclusive' means that each result does not include
+ * the corresponding input operand in the partial sum. The term 'segmented'
+ * means that the partial sums are broken into distinct segments. In other
+ * words, within each segment a separate exclusive scan operation is computed.
+ * Refer to the code sample below for example usage.
+ *
+ * This version of \p exclusive_scan_by_key uses the value \c init to
+ * initialize the exclusive scan operation.
+ *
+ * This version of \p exclusive_scan_by_key uses the binary predicate \c binary_pred
+ * to compare adjacent keys. Specifically, consecutive iterators i and
+ * i+1 in the range [first1, last1) belong to the same segment if
+ * binary_pred(*i, *(i+1)) is true, and belong to different segments otherwise.
+ *
+ * The algorithm's execution is parallelized as determined by \p exec.
+ *
+ * \param exec The execution policy to use for parallelization.
+ * \param first1 The beginning of the key sequence.
+ * \param last1 The end of the key sequence.
+ * \param first2 The beginning of the input value sequence.
+ * \param result The beginning of the output value sequence.
+ * \param init The initial value of the exclusive sum.
+ * \param binary_pred The binary predicate used to determine equality of keys.
+ * \return The end of the output sequence.
+ *
+ * \pre \p first1 may equal \p result but the range [first1, last1) and the range [result, result + (last1 - first1)) shall not overlap otherwise.
+ * \pre \p first2 may equal \p result but the range [first2, first2 + (last1 - first1)) and range [result, result + (last1 - first1)) shall not overlap otherwise.
+ *
+ * The following code snippet demonstrates how to use \p exclusive_scan_by_key using the
+ * \p thrust::host execution policy for parallelization:
+ *
+ * \code
+ * #include
+ * #include
+ * #include
+ * ...
+ *
+ * int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3};
+ * int vals[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+ *
+ * int init = 5;
+ *
+ * thrust::equal_to binary_pred;
+ *
+ * thrust::exclusive_scan_by_key(thrust::host, keys, keys + 10, vals, vals, init, binary_pred); // in-place scan
+ *
+ * // vals is now {5, 6, 7, 5, 6, 5, 5, 6, 7, 8};
+ * \endcode
+ *
+ * \see exclusive_scan
+ * \see inclusive_scan_by_key
+ *
+ */
+template
+  OutputIterator exclusive_scan_by_key(const thrust::detail::execution_policy_base &exec,
+                                       InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result,
+                                       T init,
+                                       BinaryPredicate binary_pred);
+
+
+/*! \p exclusive_scan_by_key computes an exclusive key-value or 'segmented' prefix
+ * sum operation. The term 'exclusive' means that each result does not include
+ * the corresponding input operand in the partial sum. The term 'segmented'
+ * means that the partial sums are broken into distinct segments. In other
+ * words, within each segment a separate exclusive scan operation is computed.
+ * Refer to the code sample below for example usage.
+ *
+ * This version of \p exclusive_scan_by_key uses the value \c init to
+ * initialize the exclusive scan operation.
+ *
+ * This version of \p exclusive_scan_by_key uses the binary predicate \c binary_pred
+ * to compare adjacent keys. Specifically, consecutive iterators i and
+ * i+1 in the range [first1, last1) belong to the same segment if
+ * binary_pred(*i, *(i+1)) is true, and belong to different segments otherwise.
+ *
+ * \param first1 The beginning of the key sequence.
+ * \param last1 The end of the key sequence.
+ * \param first2 The beginning of the input value sequence.
+ * \param result The beginning of the output value sequence.
+ * \param init The initial value of the exclusive sum.
+ * \param binary_pred The binary predicate used to determine equality of keys.
+ * \return The end of the output sequence.
+ *
+ * \pre \p first1 may equal \p result but the range [first1, last1) and the range [result, result + (last1 - first1)) shall not overlap otherwise.
+ * \pre \p first2 may equal \p result but the range [first2, first2 + (last1 - first1)) and range [result, result + (last1 - first1)) shall not overlap otherwise.
+ *
+ * The following code snippet demonstrates how to use \p exclusive_scan_by_key
+ *
+ * \code
+ * #include
+ * #include
+ *
+ * int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3};
+ * int vals[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+ *
+ * int init = 5;
+ *
+ * thrust::equal_to binary_pred;
+ *
+ * thrust::exclusive_scan_by_key(keys, keys + 10, vals, vals, init, binary_pred); // in-place scan
+ *
+ * // vals is now {5, 6, 7, 5, 6, 5, 5, 6, 7, 8};
+ * \endcode
+ *
+ * \see exclusive_scan
+ * \see inclusive_scan_by_key
+ *
+ */
+template
+  OutputIterator exclusive_scan_by_key(InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result,
+                                       T init,
+                                       BinaryPredicate binary_pred);
+
+
+/*! \p exclusive_scan_by_key computes an exclusive key-value or 'segmented' prefix
+ * sum operation. The term 'exclusive' means that each result does not include
+ * the corresponding input operand in the partial sum. The term 'segmented'
+ * means that the partial sums are broken into distinct segments. In other
+ * words, within each segment a separate exclusive scan operation is computed.
+ * Refer to the code sample below for example usage.
+ *
+ * This version of \p exclusive_scan_by_key uses the value \c init to
+ * initialize the exclusive scan operation.
+ *
+ * This version of \p exclusive_scan_by_key uses the binary predicate \c binary_pred
+ * to compare adjacent keys. Specifically, consecutive iterators i and
+ * i+1 in the range [first1, last1) belong to the same segment if
+ * binary_pred(*i, *(i+1)) is true, and belong to different segments otherwise.
+ *
+ * This version of \p exclusive_scan_by_key uses the associative operator
+ * \c binary_op to perform the prefix sum. When the input and output sequences
+ * are the same, the scan is performed in-place.
+ *
+ * The algorithm's execution is parallelized as determined by \p exec.
+ *
+ * \param exec The execution policy to use for parallelization.
+ * \param first1 The beginning of the key sequence.
+ * \param last1 The end of the key sequence.
+ * \param first2 The beginning of the input value sequence.
+ * \param result The beginning of the output value sequence.
+ * \param init The initial value of the exclusive sum.
+ * \param binary_pred The binary predicate used to determine equality of keys.
+ * \param binary_op The associative operator used to 'sum' values.
+ * \return The end of the output sequence.
+ *
+ * \tparam DerivedPolicy The name of the derived execution policy.
+ * \tparam InputIterator1 is a model of Input Iterator
+ * \tparam InputIterator2 is a model of Input Iterator
+ * and \c InputIterator2's \c value_type is convertible to \c OutputIterator's \c value_type.
+ * \tparam OutputIterator is a model of Output Iterator,
+ * and if \c x and \c y are objects of \c OutputIterator's \c value_type, then
+ * binary_op(x,y) is defined.
+ * \tparam T is convertible to \c OutputIterator's \c value_type.
+ * \tparam BinaryPredicate is a model of Binary Predicate.
+ * \tparam AssociativeOperator is a model of Binary Function
+ * and \c AssociativeOperator's \c result_type is convertible to \c OutputIterator's \c value_type.
+ *
+ * \pre \p first1 may equal \p result but the range [first1, last1) and the range [result, result + (last1 - first1)) shall not overlap otherwise.
+ * \pre \p first2 may equal \p result but the range [first2, first2 + (last1 - first1)) and range [result, result + (last1 - first1)) shall not overlap otherwise.
+ *
+ * The following code snippet demonstrates how to use \p exclusive_scan_by_key using the
+ * \p thrust::host execution policy for parallelization:
+ *
+ * \code
+ * #include
+ * #include
+ * #include
+ * ...
+ *
+ * int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3};
+ * int vals[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+ *
+ * int init = 5;
+ *
+ * thrust::equal_to binary_pred;
+ * thrust::plus binary_op;
+ *
+ * thrust::exclusive_scan_by_key(thrust::host, keys, keys + 10, vals, vals, init, binary_pred, binary_op); // in-place scan
+ *
+ * // vals is now {5, 6, 7, 5, 6, 5, 5, 6, 7, 8};
+ * \endcode
+ *
+ * \see exclusive_scan
+ * \see inclusive_scan_by_key
+ *
+ */
+template
+  OutputIterator exclusive_scan_by_key(const thrust::detail::execution_policy_base &exec,
+                                       InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result,
+                                       T init,
+                                       BinaryPredicate binary_pred,
+                                       AssociativeOperator binary_op);
+
+
+/*! \p exclusive_scan_by_key computes an exclusive key-value or 'segmented' prefix
+ * sum operation. The term 'exclusive' means that each result does not include
+ * the corresponding input operand in the partial sum. The term 'segmented'
+ * means that the partial sums are broken into distinct segments. In other
+ * words, within each segment a separate exclusive scan operation is computed.
+ * Refer to the code sample below for example usage.
+ *
+ * This version of \p exclusive_scan_by_key uses the value \c init to
+ * initialize the exclusive scan operation.
+ *
+ * This version of \p exclusive_scan_by_key uses the binary predicate \c binary_pred
+ * to compare adjacent keys. Specifically, consecutive iterators i and
+ * i+1 in the range [first1, last1) belong to the same segment if
+ * binary_pred(*i, *(i+1)) is true, and belong to different segments otherwise.
+ *
+ * This version of \p exclusive_scan_by_key uses the associative operator
+ * \c binary_op to perform the prefix sum. When the input and output sequences
+ * are the same, the scan is performed in-place.
+ *
+ * \param first1 The beginning of the key sequence.
+ * \param last1 The end of the key sequence.
+ * \param first2 The beginning of the input value sequence.
+ * \param result The beginning of the output value sequence.
+ * \param init The initial value of the exclusive sum.
+ * \param binary_pred The binary predicate used to determine equality of keys.
+ * \param binary_op The associative operator used to 'sum' values.
+ * \return The end of the output sequence.
+ *
+ * \tparam InputIterator1 is a model of Input Iterator
+ * \tparam InputIterator2 is a model of Input Iterator
+ * and \c InputIterator2's \c value_type is convertible to \c OutputIterator's \c value_type.
+ * \tparam OutputIterator is a model of Output Iterator,
+ * and if \c x and \c y are objects of \c OutputIterator's \c value_type, then
+ * binary_op(x,y) is defined.
+ * \tparam T is convertible to \c OutputIterator's \c value_type.
+ * \tparam BinaryPredicate is a model of Binary Predicate.
+ * \tparam AssociativeOperator is a model of Binary Function
+ * and \c AssociativeOperator's \c result_type is convertible to \c OutputIterator's \c value_type.
+ *
+ * \pre \p first1 may equal \p result but the range [first1, last1) and the range [result, result + (last1 - first1)) shall not overlap otherwise.
+ * \pre \p first2 may equal \p result but the range [first2, first2 + (last1 - first1)) and range [result, result + (last1 - first1)) shall not overlap otherwise.
+ *
+ * The following code snippet demonstrates how to use \p exclusive_scan_by_key
+ *
+ * \code
+ * #include
+ * #include
+ *
+ * int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3};
+ * int vals[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+ *
+ * int init = 5;
+ *
+ * thrust::equal_to binary_pred;
+ * thrust::plus binary_op;
+ *
+ * thrust::exclusive_scan_by_key(keys, keys + 10, vals, vals, init, binary_pred, binary_op); // in-place scan
+ *
+ * // vals is now {5, 6, 7, 5, 6, 5, 5, 6, 7, 8};
+ * \endcode
+ *
+ * \see exclusive_scan
+ * \see inclusive_scan_by_key
+ *
+ */
+template
+  OutputIterator exclusive_scan_by_key(InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       OutputIterator result,
+                                       T init,
+                                       BinaryPredicate binary_pred,
+                                       AssociativeOperator binary_op);
+
+
+/*! \} // end segmentedprefixsums
+ */
+
+
+/*! \} // end prefix sums
+ */
+
+
+} // end namespace thrust
+
+#include
+
diff --git a/compat/thrust/scatter.h b/compat/thrust/scatter.h
new file mode 100644
index 0000000..59604ca
--- /dev/null
+++ b/compat/thrust/scatter.h
@@ -0,0 +1,420 @@
+/*
+ * Copyright 2008-2012 NVIDIA Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/*! \file scatter.h
+ * \brief Irregular copying to a destination range
+ */
+
+#pragma once
+
+#include
+#include
+
+namespace thrust
+{
+
+
+/*! \addtogroup scattering
+ * \ingroup copying
+ * \{
+ */
+
+
+/*! \p scatter copies elements from a source range into an output array
+ * according to a map. For each iterator \c i in the range [\p first, \p last),
+ * the value \c *i is assigned to output[*(map + (i - first))]. The
+ * output iterator must permit random access. If the same index
+ * appears more than once in the range [map, map + (last - first)),
+ * the result is undefined.
+ *
+ * The algorithm's execution is parallelized as determined by \p exec.
+ *
+ * \param exec The execution policy to use for parallelization.
+ * \param first Beginning of the sequence of values to scatter.
+ * \param last End of the sequence of values to scatter.
+ * \param map Beginning of the sequence of output indices.
+ * \param result Destination of the source elements.
+ *
+ * \tparam DerivedPolicy The name of the derived execution policy.
+ * \tparam InputIterator1 must be a model of Input Iterator and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c value_type.
+ * \tparam InputIterator2 must be a model of Input Iterator and \c InputIterator2's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
+ * \tparam RandomAccessIterator must be a model of Random Access iterator.
+ *
+ * \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[first,last)` for all iterators `i` in the range `[map,map + (last - first))`.
+ * + * \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[map,map + (last - first))` for all iterators `i` in the range `[map,map + (last - first))`. + * + * \pre The expression `result[*i]` shall be valid for all iterators in the range `[map,map + (last - first))`. + * + * The following code snippet demonstrates how to use \p scatter to + * reorder a range using the \p thrust::device execution policy for parallelization: + * + * \code + * #include + * #include + * #include + * ... + * // mark even indices with a 1; odd indices with a 0 + * int values[10] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0}; + * thrust::device_vector d_values(values, values + 10); + * + * // scatter all even indices into the first half of the + * // range, and odd indices vice versa + * int map[10] = {0, 5, 1, 6, 2, 7, 3, 8, 4, 9}; + * thrust::device_vector d_map(map, map + 10); + * + * thrust::device_vector d_output(10); + * thrust::scatter(thrust::device, + * d_values.begin(), d_values.end(), + * d_map.begin(), d_output.begin()); + * // d_output is now {1, 1, 1, 1, 1, 0, 0, 0, 0, 0} + * \endcode + * + * \note \p scatter is the inverse of thrust::gather. + */ +template + void scatter(const thrust::detail::execution_policy_base &exec, + InputIterator1 first, + InputIterator1 last, + InputIterator2 map, + RandomAccessIterator result); + + +/*! \p scatter copies elements from a source range into an output array + * according to a map. For each iterator \c i in the range [\p first, \p last), + * the value \c *i is assigned to output[*(map + (i - first))]. The + * output iterator must permit random access. If the same index + * appears more than once in the range [map, map + (last - first)), + * the result is undefined. + * + * \param first Beginning of the sequence of values to scatter. + * \param last End of the sequence of values to scatter. + * \param map Beginning of the sequence of output indices. + * \param result Destination of the source elements. + * + * \tparam InputIterator1 must be a model of Input Iterator and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c value_type. + * \tparam InputIterator2 must be a model of Input Iterator and \c InputIterator2's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type. + * \tparam RandomAccessIterator must be a model of Random Access iterator. + * + * \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[first,last)` for all iterators `i` in the range `[map,map + (last - first))`. + * + * \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[map,map + (last - first))` for all iterators `i` in the range `[map,map + (last - first))`. + * + * \pre The expression `result[*i]` shall be valid for all iterators in the range `[map,map + (last - first))`. + * + * The following code snippet demonstrates how to use \p scatter to + * reorder a range. + * + * \code + * #include + * #include + * ... 
+ * // mark even indices with a 1; odd indices with a 0 + * int values[10] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0}; + * thrust::device_vector d_values(values, values + 10); + * + * // scatter all even indices into the first half of the + * // range, and odd indices vice versa + * int map[10] = {0, 5, 1, 6, 2, 7, 3, 8, 4, 9}; + * thrust::device_vector d_map(map, map + 10); + * + * thrust::device_vector d_output(10); + * thrust::scatter(d_values.begin(), d_values.end(), + * d_map.begin(), d_output.begin()); + * // d_output is now {1, 1, 1, 1, 1, 0, 0, 0, 0, 0} + * \endcode + * + * \note \p scatter is the inverse of thrust::gather. + */ +template + void scatter(InputIterator1 first, + InputIterator1 last, + InputIterator2 map, + RandomAccessIterator result); + + +/*! \p scatter_if conditionally copies elements from a source range into an + * output array according to a map. For each iterator \c i in the + * range [first, last) such that *(stencil + (i - first)) is + * true, the value \c *i is assigned to output[*(map + (i - first))]. + * The output iterator must permit random access. If the same index + * appears more than once in the range [map, map + (last - first)) + * the result is undefined. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first Beginning of the sequence of values to scatter. + * \param last End of the sequence of values to scatter. + * \param map Beginning of the sequence of output indices. + * \param stencil Beginning of the sequence of predicate values. + * \param output Beginning of the destination range. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator1 must be a model of Input Iterator and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c value_type. + * \tparam InputIterator2 must be a model of Input Iterator and \c InputIterator2's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type. + * \tparam InputIterator3 must be a model of Input Iterator and \c InputIterator3's \c value_type must be convertible to \c bool. + * \tparam RandomAccessIterator must be a model of Random Access iterator. + * + * \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[first,last)` for all iterators `i` in the range `[map,map + (last - first))`. + * + * \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[map,map + (last - first))` for all iterators `i` in the range `[map,map + (last - first))`. + * + * \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[stencil,stencil + (last - first))` for all iterators `i` in the range `[map,map + (last - first))`. + * + * \pre The expression `result[*i]` shall be valid for all iterators `i` in the range `[map,map + (last - first))` for which the following condition holds: `*(stencil + i) != false`. + * + * \code + * #include + * #include + * ... + * int V[8] = {10, 20, 30, 40, 50, 60, 70, 80}; + * int M[8] = {0, 5, 1, 6, 2, 7, 3, 4}; + * int S[8] = {1, 0, 1, 0, 1, 0, 1, 0}; + * int D[8] = {0, 0, 0, 0, 0, 0, 0, 0}; + * + * thrust::scatter_if(thrust::host, V, V + 8, M, S, D); + * + * // D contains [10, 30, 50, 70, 0, 0, 0, 0]; + * \endcode + * + * \note \p scatter_if is the inverse of thrust::gather_if. 
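+ *
+ * The same call also accepts device containers; the following is only a rough sketch
+ * (it assumes \c thrust::device_vector and the \p thrust::device policy are available):
+ *
+ * \code
+ * #include <thrust/scatter.h>
+ * #include <thrust/device_vector.h>
+ * #include <thrust/execution_policy.h>
+ * ...
+ * int V[8] = {10, 20, 30, 40, 50, 60, 70, 80};
+ * int M[8] = {0, 5, 1, 6, 2, 7, 3, 4};
+ * int S[8] = {1, 0, 1, 0, 1, 0, 1, 0};
+ *
+ * thrust::device_vector<int> d_V(V, V + 8);
+ * thrust::device_vector<int> d_M(M, M + 8);
+ * thrust::device_vector<int> d_S(S, S + 8);
+ * thrust::device_vector<int> d_D(8, 0);
+ *
+ * // copy d_V[i] to d_D[d_M[i]] only where the stencil d_S[i] is nonzero
+ * thrust::scatter_if(thrust::device,
+ *                    d_V.begin(), d_V.end(),
+ *                    d_M.begin(), d_S.begin(), d_D.begin());
+ *
+ * // d_D contains {10, 30, 50, 70, 0, 0, 0, 0}
+ * \endcode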
+ */ +template + void scatter_if(const thrust::detail::execution_policy_base &exec, + InputIterator1 first, + InputIterator1 last, + InputIterator2 map, + InputIterator3 stencil, + RandomAccessIterator output); + + +/*! \p scatter_if conditionally copies elements from a source range into an + * output array according to a map. For each iterator \c i in the + * range [first, last) such that *(stencil + (i - first)) is + * true, the value \c *i is assigned to output[*(map + (i - first))]. + * The output iterator must permit random access. If the same index + * appears more than once in the range [map, map + (last - first)) + * the result is undefined. + * + * \param first Beginning of the sequence of values to scatter. + * \param last End of the sequence of values to scatter. + * \param map Beginning of the sequence of output indices. + * \param stencil Beginning of the sequence of predicate values. + * \param output Beginning of the destination range. + * + * \tparam InputIterator1 must be a model of Input Iterator and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c value_type. + * \tparam InputIterator2 must be a model of Input Iterator and \c InputIterator2's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type. + * \tparam InputIterator3 must be a model of Input Iterator and \c InputIterator3's \c value_type must be convertible to \c bool. + * \tparam RandomAccessIterator must be a model of Random Access iterator. + * + * \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[first,last)` for all iterators `i` in the range `[map,map + (last - first))`. + * + * \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[map,map + (last - first))` for all iterators `i` in the range `[map,map + (last - first))`. + * + * \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[stencil,stencil + (last - first))` for all iterators `i` in the range `[map,map + (last - first))`. + * + * \pre The expression `result[*i]` shall be valid for all iterators `i` in the range `[map,map + (last - first))` for which the following condition holds: `*(stencil + i) != false`. + * + * \code + * #include + * ... + * int V[8] = {10, 20, 30, 40, 50, 60, 70, 80}; + * int M[8] = {0, 5, 1, 6, 2, 7, 3, 4}; + * int S[8] = {1, 0, 1, 0, 1, 0, 1, 0}; + * int D[8] = {0, 0, 0, 0, 0, 0, 0, 0}; + * + * thrust::scatter_if(V, V + 8, M, S, D); + * + * // D contains [10, 30, 50, 70, 0, 0, 0, 0]; + * \endcode + * + * \note \p scatter_if is the inverse of thrust::gather_if. + */ +template + void scatter_if(InputIterator1 first, + InputIterator1 last, + InputIterator2 map, + InputIterator3 stencil, + RandomAccessIterator output); + + +/*! \p scatter_if conditionally copies elements from a source range into an + * output array according to a map. For each iterator \c i in the + * range [first, last) such that pred(*(stencil + (i - first))) is + * \c true, the value \c *i is assigned to output[*(map + (i - first))]. + * The output iterator must permit random access. If the same index + * appears more than once in the range [map, map + (last - first)) + * the result is undefined. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first Beginning of the sequence of values to scatter. 
+ * \param last End of the sequence of values to scatter. + * \param map Beginning of the sequence of output indices. + * \param stencil Beginning of the sequence of predicate values. + * \param output Beginning of the destination range. + * \param pred Predicate to apply to the stencil values. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator1 must be a model of Input Iterator and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c value_type. + * \tparam InputIterator2 must be a model of Input Iterator and \c InputIterator2's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type. + * \tparam InputIterator3 must be a model of Input Iterator and \c InputIterator3's \c value_type must be convertible to \c Predicate's \c argument_type. + * \tparam RandomAccessIterator must be a model of Random Access iterator. + * \tparam Predicate must be a model of Predicate. + * + * \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[first,last)` for all iterators `i` in the range `[map,map + (last - first))`. + * + * \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[map,map + (last - first))` for all iterators `i` in the range `[map,map + (last - first))`. + * + * \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[stencil,stencil + (last - first))` for all iterators `i` in the range `[map,map + (last - first))`. + * + * \pre The expression `result[*i]` shall be valid for all iterators `i` in the range `[map,map + (last - first))` for which the following condition holds: `pred(*(stencil + i)) != false`. + * + * \code + * #include + * #include + * + * struct is_even + * { + * __host__ __device__ + * bool operator()(int x) + * { + * return (x % 2) == 0; + * } + * }; + * + * ... + * + * int V[8] = {10, 20, 30, 40, 50, 60, 70, 80}; + * int M[8] = {0, 5, 1, 6, 2, 7, 3, 4}; + * int S[8] = {2, 1, 2, 1, 2, 1, 2, 1}; + * int D[8] = {0, 0, 0, 0, 0, 0, 0, 0}; + * + * is_even pred; + * thrust::scatter_if(thrust::host, V, V + 8, M, S, D, pred); + * + * // D contains [10, 30, 50, 70, 0, 0, 0, 0]; + * \endcode + * + * \note \p scatter_if is the inverse of thrust::gather_if. + */ +template + void scatter_if(const thrust::detail::execution_policy_base &exec, + InputIterator1 first, + InputIterator1 last, + InputIterator2 map, + InputIterator3 stencil, + RandomAccessIterator output, + Predicate pred); + + +/*! \p scatter_if conditionally copies elements from a source range into an + * output array according to a map. For each iterator \c i in the + * range [first, last) such that pred(*(stencil + (i - first))) is + * \c true, the value \c *i is assigned to output[*(map + (i - first))]. + * The output iterator must permit random access. If the same index + * appears more than once in the range [map, map + (last - first)) + * the result is undefined. + * + * \param first Beginning of the sequence of values to scatter. + * \param last End of the sequence of values to scatter. + * \param map Beginning of the sequence of output indices. + * \param stencil Beginning of the sequence of predicate values. + * \param output Beginning of the destination range. + * \param pred Predicate to apply to the stencil values. 
+ * + * \tparam InputIterator1 must be a model of Input Iterator and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c value_type. + * \tparam InputIterator2 must be a model of Input Iterator and \c InputIterator2's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type. + * \tparam InputIterator3 must be a model of Input Iterator and \c InputIterator3's \c value_type must be convertible to \c Predicate's \c argument_type. + * \tparam RandomAccessIterator must be a model of Random Access iterator. + * \tparam Predicate must be a model of Predicate. + * + * \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[first,last)` for all iterators `i` in the range `[map,map + (last - first))`. + * + * \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[map,map + (last - first))` for all iterators `i` in the range `[map,map + (last - first))`. + * + * \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[stencil,stencil + (last - first))` for all iterators `i` in the range `[map,map + (last - first))`. + * + * \pre The expression `result[*i]` shall be valid for all iterators `i` in the range `[map,map + (last - first))` for which the following condition holds: `pred(*(stencil + i)) != false`. + * + * \code + * #include + * + * struct is_even + * { + * __host__ __device__ + * bool operator()(int x) + * { + * return (x % 2) == 0; + * } + * }; + * + * ... + * + * int V[8] = {10, 20, 30, 40, 50, 60, 70, 80}; + * int M[8] = {0, 5, 1, 6, 2, 7, 3, 4}; + * int S[8] = {2, 1, 2, 1, 2, 1, 2, 1}; + * int D[8] = {0, 0, 0, 0, 0, 0, 0, 0}; + * + * is_even pred; + * thrust::scatter_if(V, V + 8, M, S, D, pred); + * + * // D contains [10, 30, 50, 70, 0, 0, 0, 0]; + * \endcode + * + * \note \p scatter_if is the inverse of thrust::gather_if. + */ +template + void scatter_if(InputIterator1 first, + InputIterator1 last, + InputIterator2 map, + InputIterator3 stencil, + RandomAccessIterator output, + Predicate pred); + + +/*! \} // end scattering + */ + + +} // end namespace thrust + +#include + diff --git a/compat/thrust/sequence.h b/compat/thrust/sequence.h new file mode 100644 index 0000000..6c54a5b --- /dev/null +++ b/compat/thrust/sequence.h @@ -0,0 +1,293 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file sequence.h + * \brief Fills a range with a sequence of numbers + */ + +#pragma once + +#include +#include + +namespace thrust +{ + + +/*! \addtogroup transformations + * \{ + */ + + +/*! \p sequence fills the range [first, last) with a sequence of numbers. + * + * For each iterator \c i in the range [first, last), this version of + * \p sequence performs the assignment *i = (i - first). + * + * The algorithm's execution is parallelized as determined by \p exec. 
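+ *
+ * For instance (an illustrative sketch only, assuming \c thrust::device_vector and the
+ * \p thrust::device policy are available), a device vector can be filled with
+ * 0, 1, ..., n-1 in one call:
+ *
+ * \code
+ * #include <thrust/sequence.h>
+ * #include <thrust/device_vector.h>
+ * #include <thrust/execution_policy.h>
+ * ...
+ * thrust::device_vector<int> v(10);
+ * thrust::sequence(thrust::device, v.begin(), v.end());
+ * // v is now {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
+ * \endcode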
+ * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the sequence. + * \param last The end of the sequence. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam ForwardIterator is a model of Forward Iterator, + * and \p ForwardIterator is mutable, + * and if \c x and \c y are objects of \c ForwardIterator's \c value_type, then x + y is defined, + * and if \c T is \p ForwardIterator's \c value_type, then T(0) is defined. + * + * The following code snippet demonstrates how to use \p sequence to fill a range + * with a sequence of numbers using the \p thrust::host execution policy for parallelization: + * + * \code + * #include + * #include + * ... + * const int N = 10; + * int A[N]; + * thrust::sequence(thrust::host, A, A + 10); + * // A is now {0, 1, 2, 3, 4, 5, 6, 7, 8, 9} + * \endcode + * + * \note Unlike the similar C++ STL function \c std::iota, \p sequence offers no + * guarantee on order of execution. + * + * \see http://www.sgi.com/tech/stl/iota.html + */ +template + void sequence(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last); + + +/*! \p sequence fills the range [first, last) with a sequence of numbers. + * + * For each iterator \c i in the range [first, last), this version of + * \p sequence performs the assignment *i = (i - first). + * + * \param first The beginning of the sequence. + * \param last The end of the sequence. + * + * \tparam ForwardIterator is a model of Forward Iterator, + * and \p ForwardIterator is mutable, + * and if \c x and \c y are objects of \c ForwardIterator's \c value_type, then x + y is defined, + * and if \c T is \p ForwardIterator's \c value_type, then T(0) is defined. + * + * The following code snippet demonstrates how to use \p sequence to fill a range + * with a sequence of numbers. + * + * \code + * #include + * ... + * const int N = 10; + * int A[N]; + * thrust::sequence(A, A + 10); + * // A is now {0, 1, 2, 3, 4, 5, 6, 7, 8, 9} + * \endcode + * + * \note Unlike the similar C++ STL function \c std::iota, \p sequence offers no + * guarantee on order of execution. + * + * \see http://www.sgi.com/tech/stl/iota.html + */ +template + void sequence(ForwardIterator first, + ForwardIterator last); + + +/*! \p sequence fills the range [first, last) with a sequence of numbers. + * + * For each iterator \c i in the range [first, last), this version of + * \p sequence performs the assignment *i = init + (i - first). + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the sequence. + * \param last The end of the sequence. + * \param init The first value of the sequence of numbers. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam ForwardIterator is a model of Forward Iterator, + * and \p ForwardIterator is mutable, + * and if \c x and \c y are objects of \c ForwardIterator's \c value_type, then x + y is defined, + * and if \c T is \p ForwardIterator's \c value_type, then T(0) is defined. + * \tparam T is a model of Assignable, + * and \p T is convertible to \p ForwardIterator's \c value_type. + * + * The following code snippet demonstrates how to use \p sequence to fill a range + * with a sequence of numbers starting from the value 1 using the \p thrust::host execution + * policy for parallelization: + * + * \code + * #include + * #include + * ... 
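+ * // the call below assigns A[i] = 1 + i for each i; the order of the assignments is unspecified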
+ * const int N = 10; + * int A[N]; + * thrust::sequence(thrust::host, A, A + 10, 1); + * // A is now {1, 2, 3, 4, 5, 6, 7, 8, 9, 10} + * \endcode + * + * \note Unlike the similar C++ STL function \c std::iota, \p sequence offers no + * guarantee on order of execution. + * + * \see http://www.sgi.com/tech/stl/iota.html + */ +template + void sequence(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + T init); + + +/*! \p sequence fills the range [first, last) with a sequence of numbers. + * + * For each iterator \c i in the range [first, last), this version of + * \p sequence performs the assignment *i = init + (i - first). + * + * \param first The beginning of the sequence. + * \param last The end of the sequence. + * \param init The first value of the sequence of numbers. + * + * \tparam ForwardIterator is a model of Forward Iterator, + * and \p ForwardIterator is mutable, + * and if \c x and \c y are objects of \c ForwardIterator's \c value_type, then x + y is defined, + * and if \c T is \p ForwardIterator's \c value_type, then T(0) is defined. + * \tparam T is a model of Assignable, + * and \p T is convertible to \p ForwardIterator's \c value_type. + * + * The following code snippet demonstrates how to use \p sequence to fill a range + * with a sequence of numbers starting from the value 1. + * + * \code + * #include + * ... + * const int N = 10; + * int A[N]; + * thrust::sequence(A, A + 10, 1); + * // A is now {1, 2, 3, 4, 5, 6, 7, 8, 9, 10} + * \endcode + * + * \note Unlike the similar C++ STL function \c std::iota, \p sequence offers no + * guarantee on order of execution. + * + * \see http://www.sgi.com/tech/stl/iota.html + */ +template + void sequence(ForwardIterator first, + ForwardIterator last, + T init); + + +/*! \p sequence fills the range [first, last) with a sequence of numbers. + * + * For each iterator \c i in the range [first, last), this version of + * \p sequence performs the assignment *i = init + step * (i - first). + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the sequence. + * \param last The end of the sequence. + * \param init The first value of the sequence of numbers + * \param step The difference between consecutive elements. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam ForwardIterator is a model of Forward Iterator, + * and \p ForwardIterator is mutable, + * and if \c x and \c y are objects of \c ForwardIterator's \c value_type, then x + y is defined, + * and if \c T is \p ForwardIterator's \c value_type, then T(0) is defined. + * \tparam T is a model of Assignable, + * and \p T is convertible to \p ForwardIterator's \c value_type. + * + * The following code snippet demonstrates how to use \p sequence to fill a range + * with a sequence of numbers starting from the value 1 with a step size of 3 using the \p thrust::host + * execution policy for parallelization: + * + * \code + * #include + * #include + * ... + * const int N = 10; + * int A[N]; + * thrust::sequence(thrust::host, A, A + 10, 1, 3); + * // A is now {1, 4, 7, 10, 13, 16, 19, 22, 25, 28} + * \endcode + * + * \note Unlike the similar C++ STL function \c std::iota, \p sequence offers no + * guarantee on order of execution. 
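+ *
+ * As a further, informal example (not part of the original documentation, and assuming the
+ * bundled \c thrust::device_vector container), the same algorithm can fill a device container;
+ * the container's iterators select the appropriate backend:
+ *
+ * \code
+ * #include <thrust/sequence.h>
+ * #include <thrust/device_vector.h>
+ * ...
+ * thrust::device_vector<int> v(5);
+ * thrust::sequence(v.begin(), v.end(), 10, 2);
+ * // v is now {10, 12, 14, 16, 18}
+ * \endcode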
+ * + * \see http://www.sgi.com/tech/stl/iota.html + */ +template + void sequence(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + T init, + T step); + + +/*! \p sequence fills the range [first, last) with a sequence of numbers. + * + * For each iterator \c i in the range [first, last), this version of + * \p sequence performs the assignment *i = init + step * (i - first). + * + * \param first The beginning of the sequence. + * \param last The end of the sequence. + * \param init The first value of the sequence of numbers + * \param step The difference between consecutive elements. + * + * \tparam ForwardIterator is a model of Forward Iterator, + * and \p ForwardIterator is mutable, + * and if \c x and \c y are objects of \c ForwardIterator's \c value_type, then x + y is defined, + * and if \c T is \p ForwardIterator's \c value_type, then T(0) is defined. + * \tparam T is a model of Assignable, + * and \p T is convertible to \p ForwardIterator's \c value_type. + * + * The following code snippet demonstrates how to use \p sequence to fill a range + * with a sequence of numbers starting from the value 1 with a step size of 3. + * + * \code + * #include + * ... + * const int N = 10; + * int A[N]; + * thrust::sequence(A, A + 10, 1, 3); + * // A is now {1, 4, 7, 10, 13, 16, 19, 22, 25, 28} + * \endcode + * + * \note Unlike the similar C++ STL function \c std::iota, \p sequence offers no + * guarantee on order of execution. + * + * \see http://www.sgi.com/tech/stl/iota.html + */ +template + void sequence(ForwardIterator first, + ForwardIterator last, + T init, + T step); + + +/*! \} // end transformations + */ + + +} // end namespace thrust + +#include + diff --git a/compat/thrust/set_operations.h b/compat/thrust/set_operations.h new file mode 100644 index 0000000..a7ee624 --- /dev/null +++ b/compat/thrust/set_operations.h @@ -0,0 +1,2947 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file set_operations.h + * \brief Set theoretic operations for sorted ranges + */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ + + +/*! \addtogroup set_operations Set Operations + * \ingroup algorithms + * \{ + */ + + +/*! \p set_difference constructs a sorted range that is the set difference of the sorted + * ranges [first1, last1) and [first2, last2). The return value is the + * end of the output range. + * + * In the simplest case, \p set_difference performs the "difference" operation from set + * theory: the output range contains a copy of every element that is contained in + * [first1, last1) and not contained in [first2, last1). The general case + * is more complicated, because the input ranges may contain duplicate elements. 
+ * The generalization is that if [first1, last1) contains \c m elements
+ * that are equivalent to each other and if [first2, last2) contains \c n
+ * elements that are equivalent to them, the last max(m-n,0) elements from
+ * the [first1, last1) range shall be copied to the output range.
+ *
+ * This version of \p set_difference compares elements using \c operator<.
+ *
+ * The algorithm's execution is parallelized as determined by \p exec.
+ *
+ * \param exec The execution policy to use for parallelization.
+ * \param first1 The beginning of the first input range.
+ * \param last1 The end of the first input range.
+ * \param first2 The beginning of the second input range.
+ * \param last2 The end of the second input range.
+ * \param result The beginning of the output range.
+ * \return The end of the output range.
+ *
+ * \tparam DerivedPolicy The name of the derived execution policy.
+ * \tparam InputIterator1 is a model of Input Iterator,
+ * \p InputIterator1 and \p InputIterator2 have the same \c value_type,
+ * \p InputIterator1's \c value_type is a model of LessThan Comparable,
+ * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements,
+ * and \p InputIterator1's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types.
+ * \tparam InputIterator2 is a model of Input Iterator,
+ * \p InputIterator2 and \p InputIterator1 have the same \c value_type,
+ * \p InputIterator2's \c value_type is a model of LessThan Comparable,
+ * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements,
+ * and \p InputIterator2's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types.
+ * \tparam OutputIterator is a model of Output Iterator.
+ *
+ * \pre The ranges [first1, last1) and [first2, last2) shall be sorted with respect to operator<.
+ * \pre The resulting range shall not overlap with either input range.
+ *
+ * The following code snippet demonstrates how to use \p set_difference to compute the
+ * set difference of two sets of integers sorted in ascending order using the \p thrust::host execution
+ * policy for parallelization:
+ *
+ * \code
+ * #include <thrust/set_operations.h>
+ * #include <thrust/execution_policy.h>
+ * ...
+ * int A1[7] = {0, 1, 3, 4, 5, 6, 9};
+ * int A2[5] = {1, 3, 5, 7, 9};
+ *
+ * int result[3];
+ *
+ * int *result_end = thrust::set_difference(thrust::host, A1, A1 + 7, A2, A2 + 5, result);
+ * // result is now {0, 4, 6}
+ * \endcode
+ *
+ * \see http://www.sgi.com/tech/stl/set_difference.html
+ * \see \p includes
+ * \see \p set_union
+ * \see \p set_intersection
+ * \see \p set_symmetric_difference
+ * \see \p sort
+ * \see \p is_sorted
+ */
+template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2, typename OutputIterator>
+  OutputIterator set_difference(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                InputIterator1 first1,
+                                InputIterator1 last1,
+                                InputIterator2 first2,
+                                InputIterator2 last2,
+                                OutputIterator result);
+
+
+/*! \p set_difference constructs a sorted range that is the set difference of the sorted
+ * ranges [first1, last1) and [first2, last2). The return value is the
+ * end of the output range.
+ *
+ * In the simplest case, \p set_difference performs the "difference" operation from set
+ * theory: the output range contains a copy of every element that is contained in
+ * [first1, last1) and not contained in [first2, last2). The general case
+ * is more complicated, because the input ranges may contain duplicate elements.
+ * The generalization is that if [first1, last1) contains \c m elements + * that are equivalent to each other and if [first2, last2) contains \c n + * elements that are equivalent to them, the last max(m-n,0) elements from + * [first1, last1) range shall be copied to the output range. + * + * This version of \p set_difference compares elements using \c operator<. + * + * \param first1 The beginning of the first input range. + * \param last1 The end of the first input range. + * \param first2 The beginning of the second input range. + * \param last2 The end of the second input range. + * \param result The beginning of the output range. + * \return The end of the output range. + * + * \tparam InputIterator1 is a model of Input Iterator, + * \p InputIterator1 and \p InputIterator2 have the same \c value_type, + * \p InputIterator1's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam InputIterator2 is a model of Input Iterator, + * \p InputIterator2 and \p InputIterator1 have the same \c value_type, + * \p InputIterator2's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam OutputIterator is a model of Output Iterator. + * + * \pre The ranges [first1, last1) and [first2, last2) shall be sorted with respect to operator<. + * \pre The resulting range shall not overlap with either input range. + * + * The following code snippet demonstrates how to use \p set_difference to compute the + * set difference of two sets of integers sorted in ascending order. + * + * \code + * #include + * ... + * int A1[6] = {0, 1, 3, 4, 5, 6, 9}; + * int A2[5] = {1, 3, 5, 7, 9}; + * + * int result[3]; + * + * int *result_end = thrust::set_difference(A1, A1 + 6, A2, A2 + 5, result); + * // result is now {0, 4, 6} + * \endcode + * + * \see http://www.sgi.com/tech/stl/set_difference.html + * \see \p includes + * \see \p set_union + * \see \p set_intersection + * \see \p set_symmetric_difference + * \see \p sort + * \see \p is_sorted + */ +template + OutputIterator set_difference(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result); + + +/*! \p set_difference constructs a sorted range that is the set difference of the sorted + * ranges [first1, last1) and [first2, last2). The return value is the + * end of the output range. + * + * In the simplest case, \p set_difference performs the "difference" operation from set + * theory: the output range contains a copy of every element that is contained in + * [first1, last1) and not contained in [first2, last1). The general case + * is more complicated, because the input ranges may contain duplicate elements. + * The generalization is that if [first1, last1) contains \c m elements + * that are equivalent to each other and if [first2, last2) contains \c n + * elements that are equivalent to them, the last max(m-n,0) elements from + * [first1, last1) range shall be copied to the output range. + * + * This version of \p set_difference compares elements using a function object \p comp. 
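+ *
+ * As an informal illustration of the duplicate-handling rule described above (not part of the
+ * original documentation; shown with the default \c operator< overload for brevity):
+ *
+ * \code
+ * #include <thrust/set_operations.h>
+ * ...
+ * int A1[4] = {1, 1, 1, 3};
+ * int A2[2] = {1, 3};
+ *
+ * int result[2];
+ *
+ * // 1 occurs m = 3 times in A1 and n = 1 time in A2, so max(m-n,0) = 2 copies are kept;
+ * // 3 occurs once in each range and is removed entirely
+ * thrust::set_difference(A1, A1 + 4, A2, A2 + 2, result);
+ * // result is now {1, 1}
+ * \endcode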
+ * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first1 The beginning of the first input range. + * \param last1 The end of the first input range. + * \param first2 The beginning of the second input range. + * \param last2 The end of the second input range. + * \param result The beginning of the output range. + * \param comp Comparison operator. + * \return The end of the output range. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator1 is a model of Input Iterator, + * \p InputIterator1's \c value_type is convertable to \p StrictWeakCompare's \c first_argument_type. + * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam InputIterator2 is a model of Input Iterator, + * \p InputIterator2's \c value_type is convertable to \p StrictWeakCompare's \c second_argument_type. + * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam OutputIterator is a model of Output Iterator. + * \tparam StrictWeakCompare is a model of Strict Weak Ordering. + * + * \pre The ranges [first1, last1) and [first2, last2) shall be sorted with respect to \p comp. + * \pre The resulting range shall not overlap with either input range. + * + * The following code snippet demonstrates how to use \p set_difference to compute the + * set difference of two sets of integers sorted in descending order using the \p thrust::host execution + * policy for parallelization: + * + * \code + * #include + * #include + * #include + * ... + * int A1[6] = {9, 6, 5, 4, 3, 1, 0}; + * int A2[5] = {9, 7, 5, 3, 1}; + * + * int result[3]; + * + * int *result_end = thrust::set_difference(thrust::host, A1, A1 + 6, A2, A2 + 5, result, thrust::greater()); + * // result is now {6, 4, 0} + * \endcode + * + * \see http://www.sgi.com/tech/stl/set_difference.html + * \see \p includes + * \see \p set_union + * \see \p set_intersection + * \see \p set_symmetric_difference + * \see \p sort + * \see \p is_sorted + */ +template + OutputIterator set_difference(const thrust::detail::execution_policy_base &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + StrictWeakCompare comp); + + +/*! \p set_difference constructs a sorted range that is the set difference of the sorted + * ranges [first1, last1) and [first2, last2). The return value is the + * end of the output range. + * + * In the simplest case, \p set_difference performs the "difference" operation from set + * theory: the output range contains a copy of every element that is contained in + * [first1, last1) and not contained in [first2, last1). The general case + * is more complicated, because the input ranges may contain duplicate elements. + * The generalization is that if [first1, last1) contains \c m elements + * that are equivalent to each other and if [first2, last2) contains \c n + * elements that are equivalent to them, the last max(m-n,0) elements from + * [first1, last1) range shall be copied to the output range. + * + * This version of \p set_difference compares elements using a function object \p comp. + * + * \param first1 The beginning of the first input range. + * \param last1 The end of the first input range. + * \param first2 The beginning of the second input range. + * \param last2 The end of the second input range. 
+ * \param result The beginning of the output range. + * \param comp Comparison operator. + * \return The end of the output range. + * + * \tparam InputIterator1 is a model of Input Iterator, + * \p InputIterator1's \c value_type is convertable to \p StrictWeakCompare's \c first_argument_type. + * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam InputIterator2 is a model of Input Iterator, + * \p InputIterator2's \c value_type is convertable to \p StrictWeakCompare's \c second_argument_type. + * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam OutputIterator is a model of Output Iterator. + * \tparam StrictWeakCompare is a model of Strict Weak Ordering. + * + * \pre The ranges [first1, last1) and [first2, last2) shall be sorted with respect to \p comp. + * \pre The resulting range shall not overlap with either input range. + * + * The following code snippet demonstrates how to use \p set_difference to compute the + * set difference of two sets of integers sorted in descending order. + * + * \code + * #include + * #include + * ... + * int A1[6] = {9, 6, 5, 4, 3, 1, 0}; + * int A2[5] = {9, 7, 5, 3, 1}; + * + * int result[3]; + * + * int *result_end = thrust::set_difference(A1, A1 + 6, A2, A2 + 5, result, thrust::greater()); + * // result is now {6, 4, 0} + * \endcode + * + * \see http://www.sgi.com/tech/stl/set_difference.html + * \see \p includes + * \see \p set_union + * \see \p set_intersection + * \see \p set_symmetric_difference + * \see \p sort + * \see \p is_sorted + */ +template + OutputIterator set_difference(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + StrictWeakCompare comp); + + +/*! \p set_intersection constructs a sorted range that is the + * intersection of sorted ranges [first1, last1) and + * [first2, last2). The return value is the end of the + * output range. + * + * In the simplest case, \p set_intersection performs the + * "intersection" operation from set theory: the output range + * contains a copy of every element that is contained in both + * [first1, last1) and [first2, last2). The + * general case is more complicated, because the input ranges may + * contain duplicate elements. The generalization is that if a value + * appears \c m times in [first1, last1) and \c n times in + * [first2, last2) (where \c m may be zero), then it + * appears min(m,n) times in the output range. + * \p set_intersection is stable, meaning that both elements are + * copied from the first range rather than the second, and that the + * relative order of elements in the output range is the same as in + * the first input range. + * + * This version of \p set_intersection compares objects using + * \c operator<. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first1 The beginning of the first input range. + * \param last1 The end of the first input range. + * \param first2 The beginning of the second input range. + * \param last2 The end of the second input range. + * \param result The beginning of the output range. + * \return The end of the output range. + * + * \tparam DerivedPolicy The name of the derived execution policy. 
+ * \tparam InputIterator1 is a model of Input Iterator, + * \p InputIterator1 and \p InputIterator2 have the same \c value_type, + * \p InputIterator1's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam InputIterator2 is a model of Input Iterator, + * \p InputIterator2 and \p InputIterator1 have the same \c value_type, + * \p InputIterator2's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam OutputIterator is a model of Output Iterator. + * + * \pre The ranges [first1, last1) and [first2, last2) shall be sorted with respect to operator<. + * \pre The resulting range shall not overlap with either input range. + * + * The following code snippet demonstrates how to use \p set_intersection to compute the + * set intersection of two sets of integers sorted in ascending order using the \p thrust::host execution + * policy for parallelization: + * + * \code + * #include + * #include + * ... + * int A1[6] = {1, 3, 5, 7, 9, 11}; + * int A2[7] = {1, 1, 2, 3, 5, 8, 13}; + * + * int result[7]; + * + * int *result_end = thrust::set_intersection(thrust::host, A1, A1 + 6, A2, A2 + 7, result); + * // result is now {1, 3, 5} + * \endcode + * + * \see http://www.sgi.com/tech/stl/set_intersection.html + * \see \p includes + * \see \p set_union + * \see \p set_intersection + * \see \p set_symmetric_difference + * \see \p sort + * \see \p is_sorted + */ +template + OutputIterator set_intersection(const thrust::detail::execution_policy_base &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result); + + +/*! \p set_intersection constructs a sorted range that is the + * intersection of sorted ranges [first1, last1) and + * [first2, last2). The return value is the end of the + * output range. + * + * In the simplest case, \p set_intersection performs the + * "intersection" operation from set theory: the output range + * contains a copy of every element that is contained in both + * [first1, last1) and [first2, last2). The + * general case is more complicated, because the input ranges may + * contain duplicate elements. The generalization is that if a value + * appears \c m times in [first1, last1) and \c n times in + * [first2, last2) (where \c m may be zero), then it + * appears min(m,n) times in the output range. + * \p set_intersection is stable, meaning that both elements are + * copied from the first range rather than the second, and that the + * relative order of elements in the output range is the same as in + * the first input range. + * + * This version of \p set_intersection compares objects using + * \c operator<. + * + * \param first1 The beginning of the first input range. + * \param last1 The end of the first input range. + * \param first2 The beginning of the second input range. + * \param last2 The end of the second input range. + * \param result The beginning of the output range. + * \return The end of the output range. 
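+ *
+ * As an informal illustration of the duplicate-handling rule described above (not part of the
+ * original documentation):
+ *
+ * \code
+ * #include <thrust/set_operations.h>
+ * ...
+ * int A1[5] = {1, 1, 2, 2, 2};
+ * int A2[4] = {1, 1, 1, 2};
+ *
+ * int result[3];
+ *
+ * // 1 appears min(2,3) = 2 times and 2 appears min(3,1) = 1 time in the output,
+ * // and each copy is taken from the first range
+ * thrust::set_intersection(A1, A1 + 5, A2, A2 + 4, result);
+ * // result is now {1, 1, 2}
+ * \endcode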
+ * + * \tparam InputIterator1 is a model of Input Iterator, + * \p InputIterator1 and \p InputIterator2 have the same \c value_type, + * \p InputIterator1's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam InputIterator2 is a model of Input Iterator, + * \p InputIterator2 and \p InputIterator1 have the same \c value_type, + * \p InputIterator2's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam OutputIterator is a model of Output Iterator. + * + * \pre The ranges [first1, last1) and [first2, last2) shall be sorted with respect to operator<. + * \pre The resulting range shall not overlap with either input range. + * + * The following code snippet demonstrates how to use \p set_intersection to compute the + * set intersection of two sets of integers sorted in ascending order. + * + * \code + * #include + * ... + * int A1[6] = {1, 3, 5, 7, 9, 11}; + * int A2[7] = {1, 1, 2, 3, 5, 8, 13}; + * + * int result[7]; + * + * int *result_end = thrust::set_intersection(A1, A1 + 6, A2, A2 + 7, result); + * // result is now {1, 3, 5} + * \endcode + * + * \see http://www.sgi.com/tech/stl/set_intersection.html + * \see \p includes + * \see \p set_union + * \see \p set_intersection + * \see \p set_symmetric_difference + * \see \p sort + * \see \p is_sorted + */ +template + OutputIterator set_intersection(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result); + + +/*! \p set_intersection constructs a sorted range that is the + * intersection of sorted ranges [first1, last1) and + * [first2, last2). The return value is the end of the + * output range. + * + * In the simplest case, \p set_intersection performs the + * "intersection" operation from set theory: the output range + * contains a copy of every element that is contained in both + * [first1, last1) and [first2, last2). The + * general case is more complicated, because the input ranges may + * contain duplicate elements. The generalization is that if a value + * appears \c m times in [first1, last1) and \c n times in + * [first2, last2) (where \c m may be zero), then it + * appears min(m,n) times in the output range. + * \p set_intersection is stable, meaning that both elements are + * copied from the first range rather than the second, and that the + * relative order of elements in the output range is the same as in + * the first input range. + * + * This version of \p set_intersection compares elements using a function object \p comp. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first1 The beginning of the first input range. + * \param last1 The end of the first input range. + * \param first2 The beginning of the second input range. + * \param last2 The end of the second input range. + * \param result The beginning of the output range. + * \param comp Comparison operator. + * \return The end of the output range. 
+ * + * \pre The ranges [first1, last1) and [first2, last2) shall be sorted with respect to \p comp. + * \pre The resulting range shall not overlap with either input range. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator1 is a model of Input Iterator, + * \p InputIterator1 and \p InputIterator2 have the same \c value_type, + * \p InputIterator1's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam InputIterator2 is a model of Input Iterator, + * \p InputIterator2 and \p InputIterator1 have the same \c value_type, + * \p InputIterator2's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam OutputIterator is a model of Output Iterator. + * + * The following code snippet demonstrates how to use \p set_intersection to compute + * the set intersection of sets of integers sorted in descending order using the \p thrust::host execution + * policy for parallelization: + * + * \code + * #include + * #include + * ... + * int A1[6] = {11, 9, 7, 5, 3, 1}; + * int A2[7] = {13, 8, 5, 3, 2, 1, 1}; + * + * int result[3]; + * + * int *result_end = thrust::set_intersection(thrust::host, A1, A1 + 6, A2, A2 + 7, result, thrust::greater()); + * // result is now {5, 3, 1} + * \endcode + * + * \see http://www.sgi.com/tech/stl/set_intersection.html + * \see \p includes + * \see \p set_union + * \see \p set_intersection + * \see \p set_symmetric_difference + * \see \p sort + * \see \p is_sorted + */ +template + OutputIterator set_intersection(const thrust::detail::execution_policy_base &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + StrictWeakCompare comp); + + +/*! \p set_intersection constructs a sorted range that is the + * intersection of sorted ranges [first1, last1) and + * [first2, last2). The return value is the end of the + * output range. + * + * In the simplest case, \p set_intersection performs the + * "intersection" operation from set theory: the output range + * contains a copy of every element that is contained in both + * [first1, last1) and [first2, last2). The + * general case is more complicated, because the input ranges may + * contain duplicate elements. The generalization is that if a value + * appears \c m times in [first1, last1) and \c n times in + * [first2, last2) (where \c m may be zero), then it + * appears min(m,n) times in the output range. + * \p set_intersection is stable, meaning that both elements are + * copied from the first range rather than the second, and that the + * relative order of elements in the output range is the same as in + * the first input range. + * + * This version of \p set_intersection compares elements using a function object \p comp. + * + * \param first1 The beginning of the first input range. + * \param last1 The end of the first input range. + * \param first2 The beginning of the second input range. + * \param last2 The end of the second input range. + * \param result The beginning of the output range. 
+ * \param comp Comparison operator. + * \return The end of the output range. + * + * \pre The ranges [first1, last1) and [first2, last2) shall be sorted with respect to \p comp. + * \pre The resulting range shall not overlap with either input range. + * + * \tparam InputIterator1 is a model of Input Iterator, + * \p InputIterator1 and \p InputIterator2 have the same \c value_type, + * \p InputIterator1's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam InputIterator2 is a model of Input Iterator, + * \p InputIterator2 and \p InputIterator1 have the same \c value_type, + * \p InputIterator2's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam OutputIterator is a model of Output Iterator. + * + * The following code snippet demonstrates how to use \p set_intersection to compute + * the set intersection of sets of integers sorted in descending order. + * + * \code + * #include + * ... + * int A1[6] = {11, 9, 7, 5, 3, 1}; + * int A2[7] = {13, 8, 5, 3, 2, 1, 1}; + * + * int result[3]; + * + * int *result_end = thrust::set_intersection(A1, A1 + 6, A2, A2 + 7, result, thrust::greater()); + * // result is now {5, 3, 1} + * \endcode + * + * \see http://www.sgi.com/tech/stl/set_intersection.html + * \see \p includes + * \see \p set_union + * \see \p set_intersection + * \see \p set_symmetric_difference + * \see \p sort + * \see \p is_sorted + */ +template + OutputIterator set_intersection(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + StrictWeakCompare comp); + + +/*! \p set_symmetric_difference constructs a sorted range that is the set symmetric + * difference of the sorted ranges [first1, last1) and [first2, last2). + * The return value is the end of the output range. + * + * In the simplest case, \p set_symmetric_difference performs a set theoretic calculation: + * it constructs the union of the two sets A - B and B - A, where A and B are the two + * input ranges. That is, the output range contains a copy of every element that is + * contained in [first1, last1) but not [first2, last1), and a copy of + * every element that is contained in [first2, last2) but not [first1, last1). + * The general case is more complicated, because the input ranges may contain duplicate elements. + * The generalization is that if [first1, last1) contains \c m elements that are + * equivalent to each other and [first2, last1) contains \c n elements that are + * equivalent to them, then |m - n| of those elements shall be copied to the output + * range: the last m - n elements from [first1, last1) if m > n, and + * the last n - m of these elements from [first2, last2) if m < n. + * + * This version of \p set_union compares elements using \c operator<. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first1 The beginning of the first input range. + * \param last1 The end of the first input range. 
+ * \param first2 The beginning of the second input range. + * \param last2 The end of the second input range. + * \param result The beginning of the output range. + * \return The end of the output range. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator1 is a model of Input Iterator, + * \p InputIterator1 and \p InputIterator2 have the same \c value_type, + * \p InputIterator1's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam InputIterator2 is a model of Input Iterator, + * \p InputIterator2 and \p InputIterator1 have the same \c value_type, + * \p InputIterator2's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam OutputIterator is a model of Output Iterator. + * + * \pre The ranges [first1, last1) and [first2, last2) shall be sorted with respect to operator<. + * \pre The resulting range shall not overlap with either input range. + * + * The following code snippet demonstrates how to use \p set_symmetric_difference to compute + * the symmetric difference of two sets of integers sorted in ascending order using the \p thrust::host + * execution policy for parallelization: + * + * \code + * #include + * #include + * ... + * int A1[6] = {0, 1, 2, 2, 4, 6, 7}; + * int A2[5] = {1, 1, 2, 5, 8}; + * + * int result[6]; + * + * int *result_end = thrust::set_symmetric_difference(thrust::host, A1, A1 + 6, A2, A2 + 5, result); + * // result = {0, 4, 5, 6, 7, 8} + * \endcode + * + * \see http://www.sgi.com/tech/stl/set_symmetric_difference.html + * \see \p merge + * \see \p includes + * \see \p set_difference + * \see \p set_union + * \see \p set_intersection + * \see \p sort + * \see \p is_sorted + */ +template + OutputIterator set_symmetric_difference(const thrust::detail::execution_policy_base &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result); + + +/*! \p set_symmetric_difference constructs a sorted range that is the set symmetric + * difference of the sorted ranges [first1, last1) and [first2, last2). + * The return value is the end of the output range. + * + * In the simplest case, \p set_symmetric_difference performs a set theoretic calculation: + * it constructs the union of the two sets A - B and B - A, where A and B are the two + * input ranges. That is, the output range contains a copy of every element that is + * contained in [first1, last1) but not [first2, last1), and a copy of + * every element that is contained in [first2, last2) but not [first1, last1). + * The general case is more complicated, because the input ranges may contain duplicate elements. + * The generalization is that if [first1, last1) contains \c m elements that are + * equivalent to each other and [first2, last1) contains \c n elements that are + * equivalent to them, then |m - n| of those elements shall be copied to the output + * range: the last m - n elements from [first1, last1) if m > n, and + * the last n - m of these elements from [first2, last2) if m < n. 
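+ *
+ * As an informal illustration of this rule (not part of the original documentation):
+ *
+ * \code
+ * #include <thrust/set_operations.h>
+ * ...
+ * int A1[4] = {1, 1, 1, 2};
+ * int A2[3] = {1, 3, 3};
+ *
+ * int result[5];
+ *
+ * // 1: |3 - 1| = 2 copies survive; 2: |1 - 0| = 1 copy; 3: |0 - 2| = 2 copies
+ * thrust::set_symmetric_difference(A1, A1 + 4, A2, A2 + 3, result);
+ * // result is now {1, 1, 2, 3, 3}
+ * \endcode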
+ * + * This version of \p set_union compares elements using \c operator<. + * + * \param first1 The beginning of the first input range. + * \param last1 The end of the first input range. + * \param first2 The beginning of the second input range. + * \param last2 The end of the second input range. + * \param result The beginning of the output range. + * \return The end of the output range. + * + * \tparam InputIterator1 is a model of Input Iterator, + * \p InputIterator1 and \p InputIterator2 have the same \c value_type, + * \p InputIterator1's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam InputIterator2 is a model of Input Iterator, + * \p InputIterator2 and \p InputIterator1 have the same \c value_type, + * \p InputIterator2's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam OutputIterator is a model of Output Iterator. + * + * \pre The ranges [first1, last1) and [first2, last2) shall be sorted with respect to operator<. + * \pre The resulting range shall not overlap with either input range. + * + * The following code snippet demonstrates how to use \p set_symmetric_difference to compute + * the symmetric difference of two sets of integers sorted in ascending order. + * + * \code + * #include + * ... + * int A1[6] = {0, 1, 2, 2, 4, 6, 7}; + * int A2[5] = {1, 1, 2, 5, 8}; + * + * int result[6]; + * + * int *result_end = thrust::set_symmetric_difference(A1, A1 + 6, A2, A2 + 5, result); + * // result = {0, 4, 5, 6, 7, 8} + * \endcode + * + * \see http://www.sgi.com/tech/stl/set_symmetric_difference.html + * \see \p merge + * \see \p includes + * \see \p set_difference + * \see \p set_union + * \see \p set_intersection + * \see \p sort + * \see \p is_sorted + */ +template + OutputIterator set_symmetric_difference(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result); + + +/*! \p set_symmetric_difference constructs a sorted range that is the set symmetric + * difference of the sorted ranges [first1, last1) and [first2, last2). + * The return value is the end of the output range. + * + * In the simplest case, \p set_symmetric_difference performs a set theoretic calculation: + * it constructs the union of the two sets A - B and B - A, where A and B are the two + * input ranges. That is, the output range contains a copy of every element that is + * contained in [first1, last1) but not [first2, last1), and a copy of + * every element that is contained in [first2, last2) but not [first1, last1). + * The general case is more complicated, because the input ranges may contain duplicate elements. + * The generalization is that if [first1, last1) contains \c m elements that are + * equivalent to each other and [first2, last1) contains \c n elements that are + * equivalent to them, then |m - n| of those elements shall be copied to the output + * range: the last m - n elements from [first1, last1) if m > n, and + * the last n - m of these elements from [first2, last2) if m < n. 
+ * + * This version of \p set_union compares elements using a function object \p comp. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first1 The beginning of the first input range. + * \param last1 The end of the first input range. + * \param first2 The beginning of the second input range. + * \param last2 The end of the second input range. + * \param result The beginning of the output range. + * \param comp Comparison operator. + * \return The end of the output range. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator1 is a model of Input Iterator, + * \p InputIterator1 and \p InputIterator2 have the same \c value_type, + * \p InputIterator1's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam InputIterator2 is a model of Input Iterator, + * \p InputIterator2 and \p InputIterator1 have the same \c value_type, + * \p InputIterator2's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam OutputIterator is a model of Output Iterator. + * + * \pre The ranges [first1, last1) and [first2, last2) shall be sorted with respect to \p comp. + * \pre The resulting range shall not overlap with either input range. + * + * The following code snippet demonstrates how to use \p set_symmetric_difference to compute + * the symmetric difference of two sets of integers sorted in descending order using the \p thrust::host + * execution policy for parallelization: + * + * \code + * #include + * #include + * ... + * int A1[6] = {7, 6, 4, 2, 2, 1, 0}; + * int A2[5] = {8, 5, 2, 1, 1}; + * + * int result[6]; + * + * int *result_end = thrust::set_symmetric_difference(thrust::host, A1, A1 + 6, A2, A2 + 5, result); + * // result = {8, 7, 6, 5, 4, 0} + * \endcode + * + * \see http://www.sgi.com/tech/stl/set_symmetric_difference.html + * \see \p merge + * \see \p includes + * \see \p set_difference + * \see \p set_union + * \see \p set_intersection + * \see \p sort + * \see \p is_sorted + */ +template + OutputIterator set_symmetric_difference(const thrust::detail::execution_policy_base &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + StrictWeakCompare comp); + + +/*! \p set_symmetric_difference constructs a sorted range that is the set symmetric + * difference of the sorted ranges [first1, last1) and [first2, last2). + * The return value is the end of the output range. + * + * In the simplest case, \p set_symmetric_difference performs a set theoretic calculation: + * it constructs the union of the two sets A - B and B - A, where A and B are the two + * input ranges. That is, the output range contains a copy of every element that is + * contained in [first1, last1) but not [first2, last1), and a copy of + * every element that is contained in [first2, last2) but not [first1, last1). + * The general case is more complicated, because the input ranges may contain duplicate elements. 
+ * The generalization is that if [first1, last1) contains \c m elements that are + * equivalent to each other and [first2, last1) contains \c n elements that are + * equivalent to them, then |m - n| of those elements shall be copied to the output + * range: the last m - n elements from [first1, last1) if m > n, and + * the last n - m of these elements from [first2, last2) if m < n. + * + * This version of \p set_union compares elements using a function object \p comp. + * + * \param first1 The beginning of the first input range. + * \param last1 The end of the first input range. + * \param first2 The beginning of the second input range. + * \param last2 The end of the second input range. + * \param result The beginning of the output range. + * \param comp Comparison operator. + * \return The end of the output range. + * + * \tparam InputIterator1 is a model of Input Iterator, + * \p InputIterator1 and \p InputIterator2 have the same \c value_type, + * \p InputIterator1's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam InputIterator2 is a model of Input Iterator, + * \p InputIterator2 and \p InputIterator1 have the same \c value_type, + * \p InputIterator2's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam OutputIterator is a model of Output Iterator. + * + * \pre The ranges [first1, last1) and [first2, last2) shall be sorted with respect to \p comp. + * \pre The resulting range shall not overlap with either input range. + * + * The following code snippet demonstrates how to use \p set_symmetric_difference to compute + * the symmetric difference of two sets of integers sorted in descending order. + * + * \code + * #include + * ... + * int A1[6] = {7, 6, 4, 2, 2, 1, 0}; + * int A2[5] = {8, 5, 2, 1, 1}; + * + * int result[6]; + * + * int *result_end = thrust::set_symmetric_difference(A1, A1 + 6, A2, A2 + 5, result); + * // result = {8, 7, 6, 5, 4, 0} + * \endcode + * + * \see http://www.sgi.com/tech/stl/set_symmetric_difference.html + * \see \p merge + * \see \p includes + * \see \p set_difference + * \see \p set_union + * \see \p set_intersection + * \see \p sort + * \see \p is_sorted + */ +template + OutputIterator set_symmetric_difference(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + StrictWeakCompare comp); + + +/*! \p set_union constructs a sorted range that is the union of the sorted ranges + * [first1, last1) and [first2, last2). The return value is the + * end of the output range. + * + * In the simplest case, \p set_union performs the "union" operation from set + * theory: the output range contains a copy of every element that is contained in + * [first1, last1), [first2, last1), or both. The general case + * is more complicated, because the input ranges may contain duplicate elements. 
+ * The generalization is that if [first1, last1) contains \c m elements
+ * that are equivalent to each other and if [first2, last2) contains \c n
+ * elements that are equivalent to them, then all \c m elements from the first
+ * range shall be copied to the output range, in order, and then max(n - m, 0)
+ * elements from the second range shall be copied to the output, in order.
+ *
+ * This version of \p set_union compares elements using \c operator<.
+ *
+ * The algorithm's execution is parallelized as determined by \p exec.
+ *
+ * \param exec The execution policy to use for parallelization.
+ * \param first1 The beginning of the first input range.
+ * \param last1 The end of the first input range.
+ * \param first2 The beginning of the second input range.
+ * \param last2 The end of the second input range.
+ * \param result The beginning of the output range.
+ * \return The end of the output range.
+ *
+ * \tparam DerivedPolicy The name of the derived execution policy.
+ * \tparam InputIterator1 is a model of Input Iterator,
+ * \p InputIterator1 and \p InputIterator2 have the same \c value_type,
+ * \p InputIterator1's \c value_type is a model of LessThan Comparable,
+ * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements,
+ * and \p InputIterator1's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types.
+ * \tparam InputIterator2 is a model of Input Iterator,
+ * \p InputIterator2 and \p InputIterator1 have the same \c value_type,
+ * \p InputIterator2's \c value_type is a model of LessThan Comparable,
+ * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements,
+ * and \p InputIterator2's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types.
+ * \tparam OutputIterator is a model of Output Iterator.
+ *
+ * \pre The ranges [first1, last1) and [first2, last2) shall be sorted with respect to operator<.
+ * \pre The resulting range shall not overlap with either input range.
+ *
+ * The following code snippet demonstrates how to use \p set_union to compute the union of
+ * two sets of integers sorted in ascending order using the \p thrust::host execution policy for
+ * parallelization:
+ *
+ * \code
+ * #include <thrust/set_operations.h>
+ * #include <thrust/execution_policy.h>
+ * ...
+ * int A1[7] = {0, 2, 4, 6, 8, 10, 12};
+ * int A2[5] = {1, 3, 5, 7, 9};
+ *
+ * int result[12];
+ *
+ * int *result_end = thrust::set_union(thrust::host, A1, A1 + 7, A2, A2 + 5, result);
+ * // result = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12}
+ * \endcode
+ *
+ * \see http://www.sgi.com/tech/stl/set_union.html
+ * \see \p merge
+ * \see \p includes
+ * \see \p set_difference
+ * \see \p set_intersection
+ * \see \p set_symmetric_difference
+ * \see \p sort
+ * \see \p is_sorted
+ */
+template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2, typename OutputIterator>
+  OutputIterator set_union(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                           InputIterator1 first1,
+                           InputIterator1 last1,
+                           InputIterator2 first2,
+                           InputIterator2 last2,
+                           OutputIterator result);
+
+
+/*! \p set_union constructs a sorted range that is the union of the sorted ranges
+ * [first1, last1) and [first2, last2). The return value is the
+ * end of the output range.
+ *
+ * In the simplest case, \p set_union performs the "union" operation from set
+ * theory: the output range contains a copy of every element that is contained in
+ * [first1, last1), [first2, last2), or both.
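+ *
+ * As an informal aside (not part of the original documentation), with duplicate-free inputs
+ * the result is the familiar set union:
+ *
+ * \code
+ * int A1[3] = {0, 2, 4};
+ * int A2[3] = {1, 2, 3};
+ *
+ * int result[5];
+ *
+ * thrust::set_union(A1, A1 + 3, A2, A2 + 3, result);
+ * // result is now {0, 1, 2, 3, 4}; the single copy of 2 is taken from A1
+ * \endcode
+ *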
The general case + * is more complicated, because the input ranges may contain duplicate elements. + * The generalization is that if [first1, last1) contains \c m elements + * that are equivalent to each other and if [first2, last2) contains \c n + * elements that are equivalent to them, then all \c m elements from the first + * range shall be copied to the output range, in order, and then max(n - m, 0) + * elements from the second range shall be copied to the output, in order. + * + * This version of \p set_union compares elements using \c operator<. + * + * \param first1 The beginning of the first input range. + * \param last1 The end of the first input range. + * \param first2 The beginning of the second input range. + * \param last2 The end of the second input range. + * \param result The beginning of the output range. + * \return The end of the output range. + * + * \tparam InputIterator1 is a model of Input Iterator, + * \p InputIterator1 and \p InputIterator2 have the same \c value_type, + * \p InputIterator1's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam InputIterator2 is a model of Input Iterator, + * \p InputIterator2 and \p InputIterator1 have the same \c value_type, + * \p InputIterator2's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam OutputIterator is a model of Output Iterator. + * + * \pre The ranges [first1, last1) and [first2, last2) shall be sorted with respect to operator<. + * \pre The resulting range shall not overlap with either input range. + * + * The following code snippet demonstrates how to use \p set_union to compute the union of + * two sets of integers sorted in ascending order. + * + * \code + * #include + * ... + * int A1[6] = {0, 2, 4, 6, 8, 10, 12}; + * int A2[5] = {1, 3, 5, 7, 9}; + * + * int result[11]; + * + * int *result_end = thrust::set_union(A1, A1 + 6, A2, A2 + 5, result); + * // result = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12} + * \endcode + * + * \see http://www.sgi.com/tech/stl/set_union.html + * \see \p merge + * \see \p includes + * \see \p set_union + * \see \p set_intersection + * \see \p set_symmetric_difference + * \see \p sort + * \see \p is_sorted + */ +template + OutputIterator set_union(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result); + + +/*! \p set_union constructs a sorted range that is the union of the sorted ranges + * [first1, last1) and [first2, last2). The return value is the + * end of the output range. + * + * In the simplest case, \p set_union performs the "union" operation from set + * theory: the output range contains a copy of every element that is contained in + * [first1, last1), [first2, last1), or both. The general case + * is more complicated, because the input ranges may contain duplicate elements. 
+ * The generalization is that if [first1, last1) contains \c m elements + * that are equivalent to each other and if [first2, last2) contains \c n + * elements that are equivalent to them, then all \c m elements from the first + * range shall be copied to the output range, in order, and then max(n - m, 0) + * elements from the second range shall be copied to the output, in order. + * + * This version of \p set_union compares elements using a function object \p comp. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first1 The beginning of the first input range. + * \param last1 The end of the first input range. + * \param first2 The beginning of the second input range. + * \param last2 The end of the second input range. + * \param result The beginning of the output range. + * \param comp Comparison operator. + * \return The end of the output range. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator1 is a model of Input Iterator, + * \p InputIterator1's \c value_type is convertable to \p StrictWeakCompare's \c first_argument_type. + * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam InputIterator2 is a model of Input Iterator, + * \p InputIterator2's \c value_type is convertable to \p StrictWeakCompare's \c second_argument_type. + * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam OutputIterator is a model of Output Iterator. + * \tparam StrictWeakCompare is a model of Strict Weak Ordering. + * + * \pre The ranges [first1, last1) and [first2, last2) shall be sorted with respect to \p comp. + * \pre The resulting range shall not overlap with either input range. + * + * The following code snippet demonstrates how to use \p set_union to compute the union of + * two sets of integers sorted in ascending order using the \p thrust::host execution policy for + * parallelization: + * + * \code + * #include + * #include + * #include + * ... + * int A1[6] = {12, 10, 8, 6, 4, 2, 0}; + * int A2[5] = {9, 7, 5, 3, 1}; + * + * int result[11]; + * + * int *result_end = thrust::set_union(thrust::host, A1, A1 + 6, A2, A2 + 5, result, thrust::greater()); + * // result = {12, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0} + * \endcode + * + * \see http://www.sgi.com/tech/stl/set_union.html + * \see \p merge + * \see \p includes + * \see \p set_union + * \see \p set_intersection + * \see \p set_symmetric_difference + * \see \p sort + * \see \p is_sorted + */ +template + OutputIterator set_union(const thrust::detail::execution_policy_base &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + StrictWeakCompare comp); + + +/*! \p set_union constructs a sorted range that is the union of the sorted ranges + * [first1, last1) and [first2, last2). The return value is the + * end of the output range. + * + * In the simplest case, \p set_union performs the "union" operation from set + * theory: the output range contains a copy of every element that is contained in + * [first1, last1), [first2, last1), or both. The general case + * is more complicated, because the input ranges may contain duplicate elements. 
+ * The generalization is that if [first1, last1) contains \c m elements + * that are equivalent to each other and if [first2, last2) contains \c n + * elements that are equivalent to them, then all \c m elements from the first + * range shall be copied to the output range, in order, and then max(n - m, 0) + * elements from the second range shall be copied to the output, in order. + * + * This version of \p set_union compares elements using a function object \p comp. + * + * \param first1 The beginning of the first input range. + * \param last1 The end of the first input range. + * \param first2 The beginning of the second input range. + * \param last2 The end of the second input range. + * \param result The beginning of the output range. + * \param comp Comparison operator. + * \return The end of the output range. + * + * \tparam InputIterator1 is a model of Input Iterator, + * \p InputIterator1's \c value_type is convertable to \p StrictWeakCompare's \c first_argument_type. + * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam InputIterator2 is a model of Input Iterator, + * \p InputIterator2's \c value_type is convertable to \p StrictWeakCompare's \c second_argument_type. + * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam OutputIterator is a model of Output Iterator. + * \tparam StrictWeakCompare is a model of Strict Weak Ordering. + * + * \pre The ranges [first1, last1) and [first2, last2) shall be sorted with respect to \p comp. + * \pre The resulting range shall not overlap with either input range. + * + * The following code snippet demonstrates how to use \p set_union to compute the union of + * two sets of integers sorted in ascending order. + * + * \code + * #include + * #include + * ... + * int A1[6] = {12, 10, 8, 6, 4, 2, 0}; + * int A2[5] = {9, 7, 5, 3, 1}; + * + * int result[11]; + * + * int *result_end = thrust::set_union(A1, A1 + 6, A2, A2 + 5, result, thrust::greater()); + * // result = {12, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0} + * \endcode + * + * \see http://www.sgi.com/tech/stl/set_union.html + * \see \p merge + * \see \p includes + * \see \p set_union + * \see \p set_intersection + * \see \p set_symmetric_difference + * \see \p sort + * \see \p is_sorted + */ +template + OutputIterator set_union(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + StrictWeakCompare comp); + + +/*! \p set_difference_by_key performs a key-value difference operation from set theory. + * \p set_difference_by_key constructs a sorted range that is the difference of the sorted + * ranges [keys_first1, keys_last1) and [keys_first2, keys_last2). Associated + * with each element from the input and output key ranges is a value element. The associated input + * value ranges need not be sorted. + * + * In the simplest case, \p set_difference_by_key performs the "difference" operation from set + * theory: the keys output range contains a copy of every element that is contained in + * [keys_first1, keys_last1) and not contained in [keys_first2, keys_last2). + * The general case is more complicated, because the input ranges may contain duplicate elements. 
+ * The generalization is that if [keys_first1, keys_last1) contains \c m elements + * that are equivalent to each other and if [keys_first2, keys_last2) contains \c n + * elements that are equivalent to them, the last max(m-n,0) elements from + * [keys_first1, keys_last1) range shall be copied to the output range. + * + * Each time a key element is copied from [keys_first1, keys_last1) or + * [keys_first2, keys_last2) is copied to the keys output range, the + * corresponding value element is copied from the corresponding values input range (beginning at + * \p values_first1 or \p values_first2) to the values output range. + * + * This version of \p set_difference_by_key compares key elements using \c operator<. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param keys_first1 The beginning of the first input range of keys. + * \param keys_last1 The end of the first input range of keys. + * \param keys_first2 The beginning of the second input range of keys. + * \param keys_last2 The end of the second input range of keys. + * \param values_first1 The beginning of the first input range of values. + * \param values_first2 The beginning of the first input range of values. + * \param keys_result The beginning of the output range of keys. + * \param values_result The beginning of the output range of values. + * \return A \p pair \c p such that p.first is the end of the output range of keys, + * and such that p.second is the end of the output range of values. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator1 is a model of Input Iterator, + * \p InputIterator1 and \p InputIterator2 have the same \c value_type, + * \p InputIterator1's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam InputIterator2 is a model of Input Iterator, + * \p InputIterator2 and \p InputIterator1 have the same \c value_type, + * \p InputIterator2's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam InputIterator3 is a model of Input Iterator, + * and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. + * \tparam InputIterator4 is a model of Input Iterator, + * and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. + * \tparam OutputIterator1 is a model of Output Iterator. + * \tparam OutputIterator2 is a model of Output Iterator. + * + * \pre The ranges [keys_first1, keys_last1) and [keys_first2, keys_last2) shall be sorted with respect to operator<. + * \pre The resulting ranges shall not overlap with any input range. + * + * The following code snippet demonstrates how to use \p set_difference_by_key to compute the + * set difference of two sets of integers sorted in ascending order with their values using the \p thrust::host + * execution policy for parallelization: + * + * \code + * #include + * #include + * ... 
+ * int A_keys[7] = {0, 1, 3, 4, 5, 6, 9};
+ * int A_vals[7] = {0, 0, 0, 0, 0, 0, 0};
+ *
+ * int B_keys[5] = {1, 3, 5, 7, 9};
+ * int B_vals[5] = {1, 1, 1, 1, 1};
+ *
+ * int keys_result[3];
+ * int vals_result[3];
+ *
+ * thrust::pair<int*,int*> end = thrust::set_difference_by_key(thrust::host, A_keys, A_keys + 7, B_keys, B_keys + 5, A_vals, B_vals, keys_result, vals_result);
+ * // keys_result is now {0, 4, 6}
+ * // vals_result is now {0, 0, 0}
+ * \endcode
+ *
+ * \see \p set_union_by_key
+ * \see \p set_intersection_by_key
+ * \see \p set_symmetric_difference_by_key
+ * \see \p sort_by_key
+ * \see \p is_sorted
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    set_difference_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                          InputIterator1 keys_first1,
+                          InputIterator1 keys_last1,
+                          InputIterator2 keys_first2,
+                          InputIterator2 keys_last2,
+                          InputIterator3 values_first1,
+                          InputIterator4 values_first2,
+                          OutputIterator1 keys_result,
+                          OutputIterator2 values_result);
+
+
+/*! \p set_difference_by_key performs a key-value difference operation from set theory.
+ * \p set_difference_by_key constructs a sorted range that is the difference of the sorted
+ * ranges [keys_first1, keys_last1) and [keys_first2, keys_last2). Associated
+ * with each element from the input and output key ranges is a value element. The associated input
+ * value ranges need not be sorted.
+ *
+ * In the simplest case, \p set_difference_by_key performs the "difference" operation from set
+ * theory: the keys output range contains a copy of every element that is contained in
+ * [keys_first1, keys_last1) and not contained in [keys_first2, keys_last2).
+ * The general case is more complicated, because the input ranges may contain duplicate elements.
+ * The generalization is that if [keys_first1, keys_last1) contains \c m elements
+ * that are equivalent to each other and if [keys_first2, keys_last2) contains \c n
+ * elements that are equivalent to them, the last max(m-n,0) elements from the
+ * [keys_first1, keys_last1) range shall be copied to the output range.
+ *
+ * Every time a key element from [keys_first1, keys_last1) or
+ * [keys_first2, keys_last2) is copied to the keys output range, the
+ * corresponding value element is copied from the corresponding values input range (beginning at
+ * \p values_first1 or \p values_first2) to the values output range.
+ *
+ * This version of \p set_difference_by_key compares key elements using \c operator<.
+ *
+ * \param keys_first1 The beginning of the first input range of keys.
+ * \param keys_last1 The end of the first input range of keys.
+ * \param keys_first2 The beginning of the second input range of keys.
+ * \param keys_last2 The end of the second input range of keys.
+ * \param values_first1 The beginning of the first input range of values.
+ * \param values_first2 The beginning of the second input range of values.
+ * \param keys_result The beginning of the output range of keys.
+ * \param values_result The beginning of the output range of values.
+ * \return A \p pair \c p such that p.first is the end of the output range of keys,
+ *         and such that p.second is the end of the output range of values.
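The "last max(m-n,0) elements" rule above is easiest to see with duplicated keys. The following standalone sketch is not part of the patch: the arrays, file layout, and expected results are invented for illustration, and it simply exercises the host overload documented here.

#include <thrust/set_operations.h>
#include <thrust/execution_policy.h>
#include <thrust/pair.h>
#include <cstdio>

int main(void)
{
  int A_keys[4] = {2, 2, 2, 5};        // the key 2 occurs m = 3 times
  int A_vals[4] = {10, 11, 12, 13};
  int B_keys[2] = {2, 7};              // the key 2 occurs n = 1 time
  int B_vals[2] = {90, 91};

  int keys_result[4];
  int vals_result[4];

  // host execution policy, matching the snippets in the documentation above
  thrust::pair<int*,int*> end =
    thrust::set_difference_by_key(thrust::host,
                                  A_keys, A_keys + 4, B_keys, B_keys + 2,
                                  A_vals, B_vals,
                                  keys_result, vals_result);

  // the last max(3-1,0) = 2 copies of the key 2 survive, then the key 5;
  // each copied key brings its own value along: expect 2->11, 2->12, 5->13
  for (int *k = keys_result, *v = vals_result; k != end.first; ++k, ++v)
    std::printf("%d -> %d\n", *k, *v);

  return 0;
}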
+ * + * \tparam InputIterator1 is a model of Input Iterator, + * \p InputIterator1 and \p InputIterator2 have the same \c value_type, + * \p InputIterator1's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam InputIterator2 is a model of Input Iterator, + * \p InputIterator2 and \p InputIterator1 have the same \c value_type, + * \p InputIterator2's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam InputIterator3 is a model of Input Iterator, + * and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. + * \tparam InputIterator4 is a model of Input Iterator, + * and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. + * \tparam OutputIterator1 is a model of Output Iterator. + * \tparam OutputIterator2 is a model of Output Iterator. + * + * \pre The ranges [keys_first1, keys_last1) and [keys_first2, keys_last2) shall be sorted with respect to operator<. + * \pre The resulting ranges shall not overlap with any input range. + * + * The following code snippet demonstrates how to use \p set_difference_by_key to compute the + * set difference of two sets of integers sorted in ascending order with their values. + * + * \code + * #include + * ... + * int A_keys[6] = {0, 1, 3, 4, 5, 6, 9}; + * int A_vals[6] = {0, 0, 0, 0, 0, 0, 0}; + * + * int B_keys[5] = {1, 3, 5, 7, 9}; + * int B_vals[5] = {1, 1, 1, 1, 1}; + * + * int keys_result[3]; + * int vals_result[3]; + * + * thrust::pair end = thrust::set_difference_by_key(A_keys, A_keys + 6, B_keys, B_keys + 5, A_vals, B_vals, keys_result, vals_result); + * // keys_result is now {0, 4, 6} + * // vals_result is now {0, 0, 0} + * \endcode + * + * \see \p set_union_by_key + * \see \p set_intersection_by_key + * \see \p set_symmetric_difference_by_key + * \see \p sort_by_key + * \see \p is_sorted + */ +template + thrust::pair + set_difference_by_key(InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 keys_last2, + InputIterator3 values_first1, + InputIterator4 values_first2, + OutputIterator1 keys_result, + OutputIterator2 values_result); + + +/*! \p set_difference_by_key performs a key-value difference operation from set theory. + * \p set_difference_by_key constructs a sorted range that is the difference of the sorted + * ranges [keys_first1, keys_last1) and [keys_first2, keys_last2). Associated + * with each element from the input and output key ranges is a value element. The associated input + * value ranges need not be sorted. + * + * In the simplest case, \p set_difference_by_key performs the "difference" operation from set + * theory: the keys output range contains a copy of every element that is contained in + * [keys_first1, keys_last1) and not contained in [keys_first2, keys_last2). + * The general case is more complicated, because the input ranges may contain duplicate elements. 
+ * The generalization is that if [keys_first1, keys_last1) contains \c m elements + * that are equivalent to each other and if [keys_first2, keys_last2) contains \c n + * elements that are equivalent to them, the last max(m-n,0) elements from + * [keys_first1, keys_last1) range shall be copied to the output range. + * + * Each time a key element is copied from [keys_first1, keys_last1) or + * [keys_first2, keys_last2) is copied to the keys output range, the + * corresponding value element is copied from the corresponding values input range (beginning at + * \p values_first1 or \p values_first2) to the values output range. + * + * This version of \p set_difference_by_key compares key elements using a function object \p comp. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param keys_first1 The beginning of the first input range of keys. + * \param keys_last1 The end of the first input range of keys. + * \param keys_first2 The beginning of the second input range of keys. + * \param keys_last2 The end of the second input range of keys. + * \param values_first1 The beginning of the first input range of values. + * \param values_first2 The beginning of the first input range of values. + * \param keys_result The beginning of the output range of keys. + * \param values_result The beginning of the output range of values. + * \param comp Comparison operator. + * \return A \p pair \c p such that p.first is the end of the output range of keys, + * and such that p.second is the end of the output range of values. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator1 is a model of Input Iterator, + * \p InputIterator1 and \p InputIterator2 have the same \c value_type, + * \p InputIterator1's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam InputIterator2 is a model of Input Iterator, + * \p InputIterator2 and \p InputIterator1 have the same \c value_type, + * \p InputIterator2's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam InputIterator3 is a model of Input Iterator, + * and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. + * \tparam InputIterator4 is a model of Input Iterator, + * and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. + * \tparam OutputIterator1 is a model of Output Iterator. + * \tparam OutputIterator2 is a model of Output Iterator. + * \tparam StrictWeakCompare is a model of Strict Weak Ordering. + * + * \pre The ranges [keys_first1, keys_last1) and [keys_first2, keys_last2) shall be sorted with respect to \p comp. + * \pre The resulting ranges shall not overlap with any input range. 
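As a hedged aside (not part of the patch): \p comp does not have to be one of the functors from thrust/functional.h; any strict weak ordering works, provided both key ranges are already sorted by that same ordering. The functor name descending_order and the data below are invented for illustration.

#include <thrust/set_operations.h>
#include <thrust/execution_policy.h>
#include <thrust/pair.h>

// a user-defined strict weak ordering; the name is hypothetical
struct descending_order
{
  __host__ __device__
  bool operator()(int a, int b) const { return a > b; }
};

int main(void)
{
  // both key ranges are sorted by descending_order, as the precondition requires
  int A_keys[4] = {9, 6, 4, 1};
  int A_vals[4] = {0, 0, 0, 0};
  int B_keys[2] = {6, 1};
  int B_vals[2] = {1, 1};

  int keys_result[4];
  int vals_result[4];

  thrust::pair<int*,int*> end =
    thrust::set_difference_by_key(thrust::host,
                                  A_keys, A_keys + 4, B_keys, B_keys + 2,
                                  A_vals, B_vals,
                                  keys_result, vals_result,
                                  descending_order());

  // expect keys_result = {9, 4} and vals_result = {0, 0}
  (void)end;
  return 0;
}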
+ * + * The following code snippet demonstrates how to use \p set_difference_by_key to compute the + * set difference of two sets of integers sorted in descending order with their values using the \p thrust::host + * execution policy for parallelization: + * + * \code + * #include + * #include + * #include + * ... + * int A_keys[6] = {9, 6, 5, 4, 3, 1, 0}; + * int A_vals[6] = {0, 0, 0, 0, 0, 0, 0}; + * + * int B_keys[5] = {9, 7, 5, 3, 1}; + * int B_vals[5] = {1, 1, 1, 1, 1}; + * + * int keys_result[3]; + * int vals_result[3]; + * + * thrust::pair end = thrust::set_difference_by_key(thrust::host, A_keys, A_keys + 6, B_keys, B_keys + 5, A_vals, B_vals, keys_result, vals_result, thrust::greater()); + * // keys_result is now {0, 4, 6} + * // vals_result is now {0, 0, 0} + * \endcode + * + * \see \p set_union_by_key + * \see \p set_intersection_by_key + * \see \p set_symmetric_difference_by_key + * \see \p sort_by_key + * \see \p is_sorted + */ +template + thrust::pair + set_difference_by_key(const thrust::detail::execution_policy_base &exec, + InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 keys_last2, + InputIterator3 values_first1, + InputIterator4 values_first2, + OutputIterator1 keys_result, + OutputIterator2 values_result, + StrictWeakCompare comp); + + +/*! \p set_difference_by_key performs a key-value difference operation from set theory. + * \p set_difference_by_key constructs a sorted range that is the difference of the sorted + * ranges [keys_first1, keys_last1) and [keys_first2, keys_last2). Associated + * with each element from the input and output key ranges is a value element. The associated input + * value ranges need not be sorted. + * + * In the simplest case, \p set_difference_by_key performs the "difference" operation from set + * theory: the keys output range contains a copy of every element that is contained in + * [keys_first1, keys_last1) and not contained in [keys_first2, keys_last2). + * The general case is more complicated, because the input ranges may contain duplicate elements. + * The generalization is that if [keys_first1, keys_last1) contains \c m elements + * that are equivalent to each other and if [keys_first2, keys_last2) contains \c n + * elements that are equivalent to them, the last max(m-n,0) elements from + * [keys_first1, keys_last1) range shall be copied to the output range. + * + * Each time a key element is copied from [keys_first1, keys_last1) or + * [keys_first2, keys_last2) is copied to the keys output range, the + * corresponding value element is copied from the corresponding values input range (beginning at + * \p values_first1 or \p values_first2) to the values output range. + * + * This version of \p set_difference_by_key compares key elements using a function object \p comp. + * + * \param keys_first1 The beginning of the first input range of keys. + * \param keys_last1 The end of the first input range of keys. + * \param keys_first2 The beginning of the second input range of keys. + * \param keys_last2 The end of the second input range of keys. + * \param values_first1 The beginning of the first input range of values. + * \param values_first2 The beginning of the first input range of values. + * \param keys_result The beginning of the output range of keys. + * \param values_result The beginning of the output range of values. + * \param comp Comparison operator. 
+ * \return A \p pair \c p such that p.first is the end of the output range of keys, + * and such that p.second is the end of the output range of values. + * + * \tparam InputIterator1 is a model of Input Iterator, + * \p InputIterator1 and \p InputIterator2 have the same \c value_type, + * \p InputIterator1's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam InputIterator2 is a model of Input Iterator, + * \p InputIterator2 and \p InputIterator1 have the same \c value_type, + * \p InputIterator2's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam InputIterator3 is a model of Input Iterator, + * and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. + * \tparam InputIterator4 is a model of Input Iterator, + * and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. + * \tparam OutputIterator1 is a model of Output Iterator. + * \tparam OutputIterator2 is a model of Output Iterator. + * \tparam StrictWeakCompare is a model of Strict Weak Ordering. + * + * \pre The ranges [keys_first1, keys_last1) and [keys_first2, keys_last2) shall be sorted with respect to \p comp. + * \pre The resulting ranges shall not overlap with any input range. + * + * The following code snippet demonstrates how to use \p set_difference_by_key to compute the + * set difference of two sets of integers sorted in descending order with their values. + * + * \code + * #include + * #include + * ... + * int A_keys[6] = {9, 6, 5, 4, 3, 1, 0}; + * int A_vals[6] = {0, 0, 0, 0, 0, 0, 0}; + * + * int B_keys[5] = {9, 7, 5, 3, 1}; + * int B_vals[5] = {1, 1, 1, 1, 1}; + * + * int keys_result[3]; + * int vals_result[3]; + * + * thrust::pair end = thrust::set_difference_by_key(A_keys, A_keys + 6, B_keys, B_keys + 5, A_vals, B_vals, keys_result, vals_result, thrust::greater()); + * // keys_result is now {0, 4, 6} + * // vals_result is now {0, 0, 0} + * \endcode + * + * \see \p set_union_by_key + * \see \p set_intersection_by_key + * \see \p set_symmetric_difference_by_key + * \see \p sort_by_key + * \see \p is_sorted + */ +template + thrust::pair + set_difference_by_key(InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 keys_last2, + InputIterator3 values_first1, + InputIterator4 values_first2, + OutputIterator1 keys_result, + OutputIterator2 values_result, + StrictWeakCompare comp); + + +/*! \p set_intersection_by_key performs a key-value intersection operation from set theory. + * \p set_intersection_by_key constructs a sorted range that is the intersection of the sorted + * ranges [keys_first1, keys_last1) and [keys_first2, keys_last2). Associated + * with each element from the input and output key ranges is a value element. The associated input + * value ranges need not be sorted. 
+ * + * In the simplest case, \p set_intersection_by_key performs the "intersection" operation from set + * theory: the keys output range contains a copy of every element that is contained in both + * [keys_first1, keys_last1) [keys_first2, keys_last2). + * The general case is more complicated, because the input ranges may contain duplicate elements. + * The generalization is that if an element appears \c m times in [keys_first1, keys_last1) + * and \c n times in [keys_first2, keys_last2) (where \c m may be zero), then it + * appears min(m,n) times in the keys output range. + * \p set_intersection_by_key is stable, meaning both that elements are copied from the first + * input range rather than the second, and that the relative order of elements in the output range + * is the same as the first input range. + * + * Each time a key element is copied from [keys_first1, keys_last1) to the keys output range, + * the corresponding value element is copied from [values_first1, values_last1) to the values + * output range. + * + * This version of \p set_intersection_by_key compares objects using \c operator<. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param keys_first1 The beginning of the first input range of keys. + * \param keys_last1 The end of the first input range of keys. + * \param keys_first2 The beginning of the second input range of keys. + * \param keys_last2 The end of the second input range of keys. + * \param values_first1 The beginning of the first input range of values. + * \param keys_result The beginning of the output range of keys. + * \param values_result The beginning of the output range of values. + * \return A \p pair \c p such that p.first is the end of the output range of keys, + * and such that p.second is the end of the output range of values. + * + * \note Unlike the other key-value set operations, \p set_intersection_by_key is unique in that it has no + * \c values_first2 parameter because elements from the second input range are never copied to the output range. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator1 is a model of Input Iterator, + * \p InputIterator1 and \p InputIterator2 have the same \c value_type, + * \p InputIterator1's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam InputIterator2 is a model of Input Iterator, + * \p InputIterator2 and \p InputIterator1 have the same \c value_type, + * \p InputIterator2's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam InputIterator3 is a model of Input Iterator, + * and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. + * \tparam OutputIterator1 is a model of Output Iterator. + * \tparam OutputIterator2 is a model of Output Iterator. + * + * \pre The ranges [keys_first1, keys_last1) and [keys_first2, keys_last2) shall be sorted with respect to operator<. 
+ * \pre The resulting ranges shall not overlap with any input range.
+ *
+ * The following code snippet demonstrates how to use \p set_intersection_by_key to compute the
+ * set intersection of two sets of integers sorted in ascending order with their values using the \p thrust::host
+ * execution policy for parallelization:
+ *
+ * \code
+ * #include <thrust/set_operations.h>
+ * #include <thrust/execution_policy.h>
+ * ...
+ * int A_keys[6] = {1, 3, 5, 7, 9, 11};
+ * int A_vals[6] = {0, 0, 0, 0, 0, 0};
+ *
+ * int B_keys[7] = {1, 1, 2, 3, 5, 8, 13};
+ *
+ * int keys_result[7];
+ * int vals_result[7];
+ *
+ * thrust::pair<int*,int*> end = thrust::set_intersection_by_key(thrust::host, A_keys, A_keys + 6, B_keys, B_keys + 7, A_vals, keys_result, vals_result);
+ *
+ * // keys_result is now {1, 3, 5}
+ * // vals_result is now {0, 0, 0}
+ * \endcode
+ *
+ * \see \p set_union_by_key
+ * \see \p set_difference_by_key
+ * \see \p set_symmetric_difference_by_key
+ * \see \p sort_by_key
+ * \see \p is_sorted
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename OutputIterator1,
+         typename OutputIterator2>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    set_intersection_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                            InputIterator1 keys_first1,
+                            InputIterator1 keys_last1,
+                            InputIterator2 keys_first2,
+                            InputIterator2 keys_last2,
+                            InputIterator3 values_first1,
+                            OutputIterator1 keys_result,
+                            OutputIterator2 values_result);
+
+
+/*! \p set_intersection_by_key performs a key-value intersection operation from set theory.
+ * \p set_intersection_by_key constructs a sorted range that is the intersection of the sorted
+ * ranges [keys_first1, keys_last1) and [keys_first2, keys_last2). Associated
+ * with each element from the input and output key ranges is a value element. The associated input
+ * value ranges need not be sorted.
+ *
+ * In the simplest case, \p set_intersection_by_key performs the "intersection" operation from set
+ * theory: the keys output range contains a copy of every element that is contained in both
+ * [keys_first1, keys_last1) and [keys_first2, keys_last2).
+ * The general case is more complicated, because the input ranges may contain duplicate elements.
+ * The generalization is that if an element appears \c m times in [keys_first1, keys_last1)
+ * and \c n times in [keys_first2, keys_last2) (where \c m may be zero), then it
+ * appears min(m,n) times in the keys output range.
+ * \p set_intersection_by_key is stable, meaning both that elements are copied from the first
+ * input range rather than the second, and that the relative order of elements in the output range
+ * is the same as the first input range.
+ *
+ * Each time a key element is copied from [keys_first1, keys_last1) to the keys output range,
+ * the corresponding value element is copied from [values_first1, values_last1) to the values
+ * output range.
+ *
+ * This version of \p set_intersection_by_key compares objects using \c operator<.
+ *
+ * \param keys_first1 The beginning of the first input range of keys.
+ * \param keys_last1 The end of the first input range of keys.
+ * \param keys_first2 The beginning of the second input range of keys.
+ * \param keys_last2 The end of the second input range of keys.
+ * \param values_first1 The beginning of the first input range of values.
+ * \param keys_result The beginning of the output range of keys.
+ * \param values_result The beginning of the output range of values.
+ * \return A \p pair \c p such that p.first is the end of the output range of keys,
+ *         and such that p.second is the end of the output range of values.
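A small illustration (not part of the patch, data invented) of the min(m,n) multiplicity rule and of the fact that values are always drawn from the first input range:

#include <thrust/set_operations.h>
#include <thrust/execution_policy.h>
#include <thrust/pair.h>

int main(void)
{
  int A_keys[5] = {1, 1, 1, 2, 3};         // the key 1 appears m = 3 times
  int A_vals[5] = {10, 11, 12, 13, 14};
  int B_keys[3] = {1, 1, 3};               // the key 1 appears n = 2 times

  int keys_result[5];
  int vals_result[5];

  thrust::pair<int*,int*> end =
    thrust::set_intersection_by_key(thrust::host,
                                    A_keys, A_keys + 5, B_keys, B_keys + 3,
                                    A_vals,
                                    keys_result, vals_result);

  // key 1: min(3,2) = 2 copies; key 2: min(1,0) = 0; key 3: min(1,1) = 1
  // expect keys_result = {1, 1, 3} and, since values come from the first
  // range in order, vals_result = {10, 11, 14}
  (void)end;
  return 0;
}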
+ * + * \note Unlike the other key-value set operations, \p set_intersection_by_key is unique in that it has no + * \c values_first2 parameter because elements from the second input range are never copied to the output range. + * + * \tparam InputIterator1 is a model of Input Iterator, + * \p InputIterator1 and \p InputIterator2 have the same \c value_type, + * \p InputIterator1's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam InputIterator2 is a model of Input Iterator, + * \p InputIterator2 and \p InputIterator1 have the same \c value_type, + * \p InputIterator2's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam InputIterator3 is a model of Input Iterator, + * and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. + * \tparam OutputIterator1 is a model of Output Iterator. + * \tparam OutputIterator2 is a model of Output Iterator. + * + * \pre The ranges [keys_first1, keys_last1) and [keys_first2, keys_last2) shall be sorted with respect to operator<. + * \pre The resulting ranges shall not overlap with any input range. + * + * The following code snippet demonstrates how to use \p set_intersection_by_key to compute the + * set intersection of two sets of integers sorted in ascending order with their values. + * + * \code + * #include + * ... + * int A_keys[6] = {1, 3, 5, 7, 9, 11}; + * int A_vals[6] = {0, 0, 0, 0, 0, 0}; + * + * int B_keys[7] = {1, 1, 2, 3, 5, 8, 13}; + * + * int keys_result[7]; + * int vals_result[7]; + * + * thrust::pair end = thrust::set_intersection_by_key(A_keys, A_keys + 6, B_keys, B_keys + 7, A_vals, keys_result, vals_result); + * + * // keys_result is now {1, 3, 5} + * // vals_result is now {0, 0, 0} + * \endcode + * + * \see \p set_union_by_key + * \see \p set_difference_by_key + * \see \p set_symmetric_difference_by_key + * \see \p sort_by_key + * \see \p is_sorted + */ +template + thrust::pair + set_intersection_by_key(InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 keys_last2, + InputIterator3 values_first1, + OutputIterator1 keys_result, + OutputIterator2 values_result); + + +/*! \p set_intersection_by_key performs a key-value intersection operation from set theory. + * \p set_intersection_by_key constructs a sorted range that is the intersection of the sorted + * ranges [keys_first1, keys_last1) and [keys_first2, keys_last2). Associated + * with each element from the input and output key ranges is a value element. The associated input + * value ranges need not be sorted. + * + * In the simplest case, \p set_intersection_by_key performs the "intersection" operation from set + * theory: the keys output range contains a copy of every element that is contained in both + * [keys_first1, keys_last1) [keys_first2, keys_last2). + * The general case is more complicated, because the input ranges may contain duplicate elements. 
+ * The generalization is that if an element appears \c m times in [keys_first1, keys_last1) + * and \c n times in [keys_first2, keys_last2) (where \c m may be zero), then it + * appears min(m,n) times in the keys output range. + * \p set_intersection_by_key is stable, meaning both that elements are copied from the first + * input range rather than the second, and that the relative order of elements in the output range + * is the same as the first input range. + * + * Each time a key element is copied from [keys_first1, keys_last1) to the keys output range, + * the corresponding value element is copied from [values_first1, values_last1) to the values + * output range. + * + * This version of \p set_intersection_by_key compares objects using a function object \p comp. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param keys_first1 The beginning of the first input range of keys. + * \param keys_last1 The end of the first input range of keys. + * \param keys_first2 The beginning of the second input range of keys. + * \param keys_last2 The end of the second input range of keys. + * \param values_first1 The beginning of the first input range of values. + * \param keys_result The beginning of the output range of keys. + * \param values_result The beginning of the output range of values. + * \param comp Comparison operator. + * \return A \p pair \c p such that p.first is the end of the output range of keys, + * and such that p.second is the end of the output range of values. + * + * \note Unlike the other key-value set operations, \p set_intersection_by_key is unique in that it has no + * \c values_first2 parameter because elements from the second input range are never copied to the output range. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator1 is a model of Input Iterator, + * \p InputIterator1 and \p InputIterator2 have the same \c value_type, + * \p InputIterator1's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam InputIterator2 is a model of Input Iterator, + * \p InputIterator2 and \p InputIterator1 have the same \c value_type, + * \p InputIterator2's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam InputIterator3 is a model of Input Iterator, + * and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. + * \tparam OutputIterator1 is a model of Output Iterator. + * \tparam OutputIterator2 is a model of Output Iterator. + * \tparam StrictWeakCompare is a model of Strict Weak Ordering. + * + * \pre The ranges [keys_first1, keys_last1) and [keys_first2, keys_last2) shall be sorted with respect to \p comp. + * \pre The resulting ranges shall not overlap with any input range. 
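The documentation snippets use raw host arrays with \p thrust::host. As a hedged sketch of the device path (assuming compilation with nvcc and the CUDA backend; all data below is invented), the same comparator-based overload can be driven by thrust::device_vector iterators, in which case no explicit execution policy is needed because dispatch follows the iterators:

#include <thrust/set_operations.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/pair.h>

int main(void)
{
  int a_keys_raw[6] = {11, 9, 7, 5, 3, 1};   // sorted descending
  int a_vals_raw[6] = { 0, 0, 0, 0, 0, 0};
  int b_keys_raw[5] = {13, 8, 5, 3, 1};      // sorted descending

  // copy the host data to the device
  thrust::device_vector<int> A_keys(a_keys_raw, a_keys_raw + 6);
  thrust::device_vector<int> A_vals(a_vals_raw, a_vals_raw + 6);
  thrust::device_vector<int> B_keys(b_keys_raw, b_keys_raw + 5);

  thrust::device_vector<int> keys_result(6);
  thrust::device_vector<int> vals_result(6);

  typedef thrust::device_vector<int>::iterator Iter;
  thrust::pair<Iter,Iter> end =
    thrust::set_intersection_by_key(A_keys.begin(), A_keys.end(),
                                    B_keys.begin(), B_keys.end(),
                                    A_vals.begin(),
                                    keys_result.begin(), vals_result.begin(),
                                    thrust::greater<int>());

  // shrink the outputs to the number of elements actually produced;
  // keys_result should hold {5, 3, 1} with all values 0
  keys_result.resize(end.first  - keys_result.begin());
  vals_result.resize(end.second - vals_result.begin());
  return 0;
}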
+ * + * The following code snippet demonstrates how to use \p set_intersection_by_key to compute the + * set intersection of two sets of integers sorted in descending order with their values using the + * \p thrust::host execution policy for parallelization: + * + * \code + * #include + * #include + * #include + * ... + * int A_keys[6] = {11, 9, 7, 5, 3, 1}; + * int A_vals[6] = { 0, 0, 0, 0, 0, 0}; + * + * int B_keys[7] = {13, 8, 5, 3, 2, 1, 1}; + * + * int keys_result[7]; + * int vals_result[7]; + * + * thrust::pair end = thrust::set_intersection_by_key(thrust::host, A_keys, A_keys + 6, B_keys, B_keys + 7, A_vals, keys_result, vals_result, thrust::greater()); + * + * // keys_result is now {5, 3, 1} + * // vals_result is now {0, 0, 0} + * \endcode + * + * \see \p set_union_by_key + * \see \p set_difference_by_key + * \see \p set_symmetric_difference_by_key + * \see \p sort_by_key + * \see \p is_sorted + */ +template + thrust::pair + set_intersection_by_key(const thrust::detail::execution_policy_base &exec, + InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 keys_last2, + InputIterator3 values_first1, + OutputIterator1 keys_result, + OutputIterator2 values_result, + StrictWeakCompare comp); + + +/*! \p set_intersection_by_key performs a key-value intersection operation from set theory. + * \p set_intersection_by_key constructs a sorted range that is the intersection of the sorted + * ranges [keys_first1, keys_last1) and [keys_first2, keys_last2). Associated + * with each element from the input and output key ranges is a value element. The associated input + * value ranges need not be sorted. + * + * In the simplest case, \p set_intersection_by_key performs the "intersection" operation from set + * theory: the keys output range contains a copy of every element that is contained in both + * [keys_first1, keys_last1) [keys_first2, keys_last2). + * The general case is more complicated, because the input ranges may contain duplicate elements. + * The generalization is that if an element appears \c m times in [keys_first1, keys_last1) + * and \c n times in [keys_first2, keys_last2) (where \c m may be zero), then it + * appears min(m,n) times in the keys output range. + * \p set_intersection_by_key is stable, meaning both that elements are copied from the first + * input range rather than the second, and that the relative order of elements in the output range + * is the same as the first input range. + * + * Each time a key element is copied from [keys_first1, keys_last1) to the keys output range, + * the corresponding value element is copied from [values_first1, values_last1) to the values + * output range. + * + * This version of \p set_intersection_by_key compares objects using a function object \p comp. + * + * \param keys_first1 The beginning of the first input range of keys. + * \param keys_last1 The end of the first input range of keys. + * \param keys_first2 The beginning of the second input range of keys. + * \param keys_last2 The end of the second input range of keys. + * \param values_first1 The beginning of the first input range of values. + * \param keys_result The beginning of the output range of keys. + * \param values_result The beginning of the output range of values. + * \param comp Comparison operator. + * \return A \p pair \c p such that p.first is the end of the output range of keys, + * and such that p.second is the end of the output range of values. 
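Every overload above carries the precondition that both key ranges are sorted by the comparison in use. A debug build can check that precondition with \p thrust::is_sorted (declared in thrust/sort.h) before invoking the set operation. A minimal sketch, not part of the patch, with invented data:

#include <thrust/set_operations.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <thrust/sort.h>
#include <cassert>

int main(void)
{
  int A_keys[6] = {11, 9, 7, 5, 3, 1};
  int B_keys[5] = {13, 8, 5, 3, 1};

  thrust::greater<int> comp;

  // both inputs must be sorted with respect to the same comparison object
  assert(thrust::is_sorted(thrust::host, A_keys, A_keys + 6, comp));
  assert(thrust::is_sorted(thrust::host, B_keys, B_keys + 5, comp));

  return 0;
}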
+ * + * \note Unlike the other key-value set operations, \p set_intersection_by_key is unique in that it has no + * \c values_first2 parameter because elements from the second input range are never copied to the output range. + * + * \tparam InputIterator1 is a model of Input Iterator, + * \p InputIterator1 and \p InputIterator2 have the same \c value_type, + * \p InputIterator1's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam InputIterator2 is a model of Input Iterator, + * \p InputIterator2 and \p InputIterator1 have the same \c value_type, + * \p InputIterator2's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam InputIterator3 is a model of Input Iterator, + * and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. + * \tparam OutputIterator1 is a model of Output Iterator. + * \tparam OutputIterator2 is a model of Output Iterator. + * \tparam StrictWeakCompare is a model of Strict Weak Ordering. + * + * \pre The ranges [keys_first1, keys_last1) and [keys_first2, keys_last2) shall be sorted with respect to \p comp. + * \pre The resulting ranges shall not overlap with any input range. + * + * The following code snippet demonstrates how to use \p set_intersection_by_key to compute the + * set intersection of two sets of integers sorted in descending order with their values. + * + * \code + * #include + * #include + * ... + * int A_keys[6] = {11, 9, 7, 5, 3, 1}; + * int A_vals[6] = { 0, 0, 0, 0, 0, 0}; + * + * int B_keys[7] = {13, 8, 5, 3, 2, 1, 1}; + * + * int keys_result[7]; + * int vals_result[7]; + * + * thrust::pair end = thrust::set_intersection_by_key(A_keys, A_keys + 6, B_keys, B_keys + 7, A_vals, keys_result, vals_result, thrust::greater()); + * + * // keys_result is now {5, 3, 1} + * // vals_result is now {0, 0, 0} + * \endcode + * + * \see \p set_union_by_key + * \see \p set_difference_by_key + * \see \p set_symmetric_difference_by_key + * \see \p sort_by_key + * \see \p is_sorted + */ +template + thrust::pair + set_intersection_by_key(InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 keys_last2, + InputIterator3 values_first1, + OutputIterator1 keys_result, + OutputIterator2 values_result, + StrictWeakCompare comp); + + +/*! \p set_symmetric_difference_by_key performs a key-value symmetric difference operation from set theory. + * \p set_difference_by_key constructs a sorted range that is the symmetric difference of the sorted + * ranges [keys_first1, keys_last1) and [keys_first2, keys_last2). Associated + * with each element from the input and output key ranges is a value element. The associated input + * value ranges need not be sorted. + * + * In the simplest case, \p set_symmetric_difference_by_key performs a set theoretic calculation: + * it constructs the union of the two sets A - B and B - A, where A and B are the two + * input ranges. 
That is, the output range contains a copy of every element that is + * contained in [keys_first1, keys_last1) but not [keys_first2, keys_last1), and a copy of + * every element that is contained in [keys_first2, keys_last2) but not [keys_first1, keys_last1). + * The general case is more complicated, because the input ranges may contain duplicate elements. + * The generalization is that if [keys_first1, keys_last1) contains \c m elements that are + * equivalent to each other and [keys_first2, keys_last1) contains \c n elements that are + * equivalent to them, then |m - n| of those elements shall be copied to the output + * range: the last m - n elements from [keys_first1, keys_last1) if m > n, and + * the last n - m of these elements from [keys_first2, keys_last2) if m < n. + * + * Each time a key element is copied from [keys_first1, keys_last1) or + * [keys_first2, keys_last2) is copied to the keys output range, the + * corresponding value element is copied from the corresponding values input range (beginning at + * \p values_first1 or \p values_first2) to the values output range. + * + * This version of \p set_symmetric_difference_by_key compares key elements using \c operator<. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param keys_first1 The beginning of the first input range of keys. + * \param keys_last1 The end of the first input range of keys. + * \param keys_first2 The beginning of the second input range of keys. + * \param keys_last2 The end of the second input range of keys. + * \param values_first1 The beginning of the first input range of values. + * \param values_first2 The beginning of the first input range of values. + * \param keys_result The beginning of the output range of keys. + * \param values_result The beginning of the output range of values. + * \return A \p pair \c p such that p.first is the end of the output range of keys, + * and such that p.second is the end of the output range of values. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator1 is a model of Input Iterator, + * \p InputIterator1 and \p InputIterator2 have the same \c value_type, + * \p InputIterator1's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam InputIterator2 is a model of Input Iterator, + * \p InputIterator2 and \p InputIterator1 have the same \c value_type, + * \p InputIterator2's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam InputIterator3 is a model of Input Iterator, + * and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. + * \tparam InputIterator4 is a model of Input Iterator, + * and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. + * \tparam OutputIterator1 is a model of Output Iterator. + * \tparam OutputIterator2 is a model of Output Iterator. 
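The |m - n| rule is easiest to check with a small worked example. This sketch is not part of the patch; the arrays and the expected output are invented to illustrate the rule quoted above.

#include <thrust/set_operations.h>
#include <thrust/execution_policy.h>
#include <thrust/pair.h>

int main(void)
{
  int A_keys[4] = {1, 2, 2, 2};   // the key 2 appears m = 3 times
  int A_vals[4] = {0, 0, 0, 0};
  int B_keys[3] = {2, 3, 3};      // the key 2 appears n = 1 time, the key 3 twice
  int B_vals[3] = {1, 1, 1};

  int keys_result[7];
  int vals_result[7];

  thrust::pair<int*,int*> end =
    thrust::set_symmetric_difference_by_key(thrust::host,
                                            A_keys, A_keys + 4, B_keys, B_keys + 3,
                                            A_vals, B_vals,
                                            keys_result, vals_result);

  // key 1: only in A -> 1 copy from A; key 2: |3-1| = 2 copies from A;
  // key 3: |0-2| = 2 copies from B
  // expect keys_result = {1, 2, 2, 3, 3} and vals_result = {0, 0, 0, 1, 1}
  (void)end;
  return 0;
}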
+ * + * \pre The ranges [keys_first1, keys_last1) and [keys_first2, keys_last2) shall be sorted with respect to operator<. + * \pre The resulting ranges shall not overlap with any input range. + * + * The following code snippet demonstrates how to use \p set_symmetric_difference_by_key to compute the + * symmetric difference of two sets of integers sorted in ascending order with their values using the + * \p thrust::host execution policy for parallelization: + * + * \code + * #include + * #include + * ... + * int A_keys[6] = {0, 1, 2, 2, 4, 6, 7}; + * int A_vals[6] = {0, 0, 0, 0, 0, 0, 0}; + * + * int B_keys[5] = {1, 1, 2, 5, 8}; + * int B_vals[5] = {1, 1, 1, 1, 1}; + * + * int keys_result[6]; + * int vals_result[6]; + * + * thrust::pair end = thrust::set_symmetric_difference_by_key(thrust::host, A_keys, A_keys + 6, B_keys, B_keys + 5, A_vals, B_vals, keys_result, vals_result); + * // keys_result is now {0, 4, 5, 6, 7, 8} + * // vals_result is now {0, 0, 1, 0, 0, 1} + * \endcode + * + * \see \p set_union_by_key + * \see \p set_intersection_by_key + * \see \p set_difference_by_key + * \see \p sort_by_key + * \see \p is_sorted + */ +template + thrust::pair + set_symmetric_difference_by_key(const thrust::detail::execution_policy_base &exec, + InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 keys_last2, + InputIterator3 values_first1, + InputIterator4 values_first2, + OutputIterator1 keys_result, + OutputIterator2 values_result); + + +/*! \p set_symmetric_difference_by_key performs a key-value symmetric difference operation from set theory. + * \p set_difference_by_key constructs a sorted range that is the symmetric difference of the sorted + * ranges [keys_first1, keys_last1) and [keys_first2, keys_last2). Associated + * with each element from the input and output key ranges is a value element. The associated input + * value ranges need not be sorted. + * + * In the simplest case, \p set_symmetric_difference_by_key performs a set theoretic calculation: + * it constructs the union of the two sets A - B and B - A, where A and B are the two + * input ranges. That is, the output range contains a copy of every element that is + * contained in [keys_first1, keys_last1) but not [keys_first2, keys_last1), and a copy of + * every element that is contained in [keys_first2, keys_last2) but not [keys_first1, keys_last1). + * The general case is more complicated, because the input ranges may contain duplicate elements. + * The generalization is that if [keys_first1, keys_last1) contains \c m elements that are + * equivalent to each other and [keys_first2, keys_last1) contains \c n elements that are + * equivalent to them, then |m - n| of those elements shall be copied to the output + * range: the last m - n elements from [keys_first1, keys_last1) if m > n, and + * the last n - m of these elements from [keys_first2, keys_last2) if m < n. + * + * Each time a key element is copied from [keys_first1, keys_last1) or + * [keys_first2, keys_last2) is copied to the keys output range, the + * corresponding value element is copied from the corresponding values input range (beginning at + * \p values_first1 or \p values_first2) to the values output range. + * + * This version of \p set_symmetric_difference_by_key compares key elements using \c operator<. + * + * \param keys_first1 The beginning of the first input range of keys. + * \param keys_last1 The end of the first input range of keys. + * \param keys_first2 The beginning of the second input range of keys. 
+ * \param keys_last2 The end of the second input range of keys. + * \param values_first1 The beginning of the first input range of values. + * \param values_first2 The beginning of the first input range of values. + * \param keys_result The beginning of the output range of keys. + * \param values_result The beginning of the output range of values. + * \return A \p pair \c p such that p.first is the end of the output range of keys, + * and such that p.second is the end of the output range of values. + * + * \tparam InputIterator1 is a model of Input Iterator, + * \p InputIterator1 and \p InputIterator2 have the same \c value_type, + * \p InputIterator1's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam InputIterator2 is a model of Input Iterator, + * \p InputIterator2 and \p InputIterator1 have the same \c value_type, + * \p InputIterator2's \c value_type is a model of LessThan Comparable, + * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, + * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. + * \tparam InputIterator3 is a model of Input Iterator, + * and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. + * \tparam InputIterator4 is a model of Input Iterator, + * and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. + * \tparam OutputIterator1 is a model of Output Iterator. + * \tparam OutputIterator2 is a model of Output Iterator. + * + * \pre The ranges [keys_first1, keys_last1) and [keys_first2, keys_last2) shall be sorted with respect to operator<. + * \pre The resulting ranges shall not overlap with any input range. + * + * The following code snippet demonstrates how to use \p set_symmetric_difference_by_key to compute the + * symmetric difference of two sets of integers sorted in ascending order with their values. + * + * \code + * #include + * ... + * int A_keys[6] = {0, 1, 2, 2, 4, 6, 7}; + * int A_vals[6] = {0, 0, 0, 0, 0, 0, 0}; + * + * int B_keys[5] = {1, 1, 2, 5, 8}; + * int B_vals[5] = {1, 1, 1, 1, 1}; + * + * int keys_result[6]; + * int vals_result[6]; + * + * thrust::pair end = thrust::set_symmetric_difference_by_key(A_keys, A_keys + 6, B_keys, B_keys + 5, A_vals, B_vals, keys_result, vals_result); + * // keys_result is now {0, 4, 5, 6, 7, 8} + * // vals_result is now {0, 0, 1, 0, 0, 1} + * \endcode + * + * \see \p set_union_by_key + * \see \p set_intersection_by_key + * \see \p set_difference_by_key + * \see \p sort_by_key + * \see \p is_sorted + */ +template + thrust::pair + set_symmetric_difference_by_key(InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 keys_last2, + InputIterator3 values_first1, + InputIterator4 values_first2, + OutputIterator1 keys_result, + OutputIterator2 values_result); + + +/*! \p set_symmetric_difference_by_key performs a key-value symmetric difference operation from set theory. + * \p set_difference_by_key constructs a sorted range that is the symmetric difference of the sorted + * ranges [keys_first1, keys_last1) and [keys_first2, keys_last2). 
+ * with each element from the input and output key ranges is a value element. The associated input
+ * value ranges need not be sorted.
+ *
+ * In the simplest case, \p set_symmetric_difference_by_key performs a set theoretic calculation:
+ * it constructs the union of the two sets A - B and B - A, where A and B are the two
+ * input ranges. That is, the output range contains a copy of every element that is
+ * contained in [keys_first1, keys_last1) but not [keys_first2, keys_last2), and a copy of
+ * every element that is contained in [keys_first2, keys_last2) but not [keys_first1, keys_last1).
+ * The general case is more complicated, because the input ranges may contain duplicate elements.
+ * The generalization is that if [keys_first1, keys_last1) contains \c m elements that are
+ * equivalent to each other and [keys_first2, keys_last2) contains \c n elements that are
+ * equivalent to them, then |m - n| of those elements shall be copied to the output
+ * range: the last m - n of these elements from [keys_first1, keys_last1) if m > n, and
+ * the last n - m of these elements from [keys_first2, keys_last2) if m < n.
+ *
+ * Each time a key element from [keys_first1, keys_last1) or
+ * [keys_first2, keys_last2) is copied to the keys output range, the
+ * corresponding value element is copied from the corresponding values input range (beginning at
+ * \p values_first1 or \p values_first2) to the values output range.
+ *
+ * This version of \p set_symmetric_difference_by_key compares key elements using a function object \c comp.
+ *
+ * The algorithm's execution is parallelized as determined by \p exec.
+ *
+ * \param exec The execution policy to use for parallelization.
+ * \param keys_first1 The beginning of the first input range of keys.
+ * \param keys_last1 The end of the first input range of keys.
+ * \param keys_first2 The beginning of the second input range of keys.
+ * \param keys_last2 The end of the second input range of keys.
+ * \param values_first1 The beginning of the first input range of values.
+ * \param values_first2 The beginning of the second input range of values.
+ * \param keys_result The beginning of the output range of keys.
+ * \param values_result The beginning of the output range of values.
+ * \param comp Comparison operator.
+ * \return A \p pair \c p such that p.first is the end of the output range of keys,
+ * and such that p.second is the end of the output range of values.
+ *
+ * \tparam DerivedPolicy The name of the derived execution policy.
+ * \tparam InputIterator1 is a model of Input Iterator,
+ * \p InputIterator1 and \p InputIterator2 have the same \c value_type,
+ * \p InputIterator1's \c value_type is a model of LessThan Comparable,
+ * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements,
+ * and \p InputIterator1's \c value_type is convertible to a type in \p OutputIterator1's set of \c value_types.
+ * \tparam InputIterator2 is a model of Input Iterator,
+ * \p InputIterator2 and \p InputIterator1 have the same \c value_type,
+ * \p InputIterator2's \c value_type is a model of LessThan Comparable,
+ * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements,
+ * and \p InputIterator2's \c value_type is convertible to a type in \p OutputIterator1's set of \c value_types.
+ * \tparam InputIterator3 is a model of Input Iterator,
+ * and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ * \tparam InputIterator4 is a model of Input Iterator,
+ * and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ * \tparam OutputIterator1 is a model of Output Iterator.
+ * \tparam OutputIterator2 is a model of Output Iterator.
+ * \tparam StrictWeakCompare is a model of Strict Weak Ordering.
+ *
+ * \pre The ranges [keys_first1, keys_last1) and [keys_first2, keys_last2) shall be sorted with respect to \p comp.
+ * \pre The resulting ranges shall not overlap with any input range.
+ *
+ * The following code snippet demonstrates how to use \p set_symmetric_difference_by_key to compute the
+ * symmetric difference of two sets of integers sorted in descending order with their values using the
+ * \p thrust::host execution policy for parallelization:
+ *
+ * \code
+ * #include <thrust/set_operations.h>
+ * #include <thrust/functional.h>
+ * #include <thrust/execution_policy.h>
+ * ...
+ * int A_keys[7] = {7, 6, 4, 2, 1, 1, 0};
+ * int A_vals[7] = {0, 0, 0, 0, 0, 0, 0};
+ *
+ * int B_keys[5] = {8, 5, 2, 1, 1};
+ * int B_vals[5] = {1, 1, 1, 1, 1};
+ *
+ * int keys_result[6];
+ * int vals_result[6];
+ *
+ * thrust::pair<int*,int*> end = thrust::set_symmetric_difference_by_key(thrust::host, A_keys, A_keys + 7, B_keys, B_keys + 5, A_vals, B_vals, keys_result, vals_result, thrust::greater<int>());
+ * // keys_result is now {8, 7, 6, 5, 4, 0}
+ * // vals_result is now {1, 0, 0, 1, 0, 0}
+ * \endcode
+ *
+ * \see \p set_union_by_key
+ * \see \p set_intersection_by_key
+ * \see \p set_difference_by_key
+ * \see \p sort_by_key
+ * \see \p is_sorted
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename StrictWeakCompare>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    set_symmetric_difference_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                                    InputIterator1 keys_first1,
+                                    InputIterator1 keys_last1,
+                                    InputIterator2 keys_first2,
+                                    InputIterator2 keys_last2,
+                                    InputIterator3 values_first1,
+                                    InputIterator4 values_first2,
+                                    OutputIterator1 keys_result,
+                                    OutputIterator2 values_result,
+                                    StrictWeakCompare comp);
+
+
+/*! \p set_symmetric_difference_by_key performs a key-value symmetric difference operation from set theory.
+ * \p set_symmetric_difference_by_key constructs a sorted range that is the symmetric difference of the sorted
+ * ranges [keys_first1, keys_last1) and [keys_first2, keys_last2). Associated
+ * with each element from the input and output key ranges is a value element. The associated input
+ * value ranges need not be sorted.
+ *
+ * In the simplest case, \p set_symmetric_difference_by_key performs a set theoretic calculation:
+ * it constructs the union of the two sets A - B and B - A, where A and B are the two
+ * input ranges. That is, the output range contains a copy of every element that is
+ * contained in [keys_first1, keys_last1) but not [keys_first2, keys_last2), and a copy of
+ * every element that is contained in [keys_first2, keys_last2) but not [keys_first1, keys_last1).
+ * The general case is more complicated, because the input ranges may contain duplicate elements.
+ * The generalization is that if [keys_first1, keys_last1) contains \c m elements that are
+ * equivalent to each other and [keys_first2, keys_last2) contains \c n elements that are
+ * equivalent to them, then |m - n| of those elements shall be copied to the output
+ * range: the last m - n of these elements from [keys_first1, keys_last1) if m > n, and
+ * the last n - m of these elements from [keys_first2, keys_last2) if m < n.
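+ *
+ * As an illustration of the |m - n| rule above, the following minimal sketch (hypothetical data,
+ * shown here only to make the handling of duplicate keys concrete) keeps exactly one of the two
+ * equivalent keys from the first range:
+ *
+ * \code
+ * #include <thrust/set_operations.h>
+ * #include <thrust/functional.h>
+ * ...
+ * int A_keys[3] = {5, 2, 2};   // sorted descending; contains two 2s (m == 2)
+ * int A_vals[3] = {0, 0, 0};
+ * int B_keys[2] = {2, 1};      // contains one 2 (n == 1)
+ * int B_vals[2] = {1, 1};
+ * int keys_result[3];
+ * int vals_result[3];
+ * thrust::set_symmetric_difference_by_key(A_keys, A_keys + 3, B_keys, B_keys + 2, A_vals, B_vals, keys_result, vals_result, thrust::greater<int>());
+ * // keys_result is now {5, 2, 1}   -- |m - n| == 1 copy of the key 2 survives, taken from the first range
+ * // vals_result is now {0, 0, 1}
+ * \endcode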
+ *
+ * Each time a key element from [keys_first1, keys_last1) or
+ * [keys_first2, keys_last2) is copied to the keys output range, the
+ * corresponding value element is copied from the corresponding values input range (beginning at
+ * \p values_first1 or \p values_first2) to the values output range.
+ *
+ * This version of \p set_symmetric_difference_by_key compares key elements using a function object \c comp.
+ *
+ * \param keys_first1 The beginning of the first input range of keys.
+ * \param keys_last1 The end of the first input range of keys.
+ * \param keys_first2 The beginning of the second input range of keys.
+ * \param keys_last2 The end of the second input range of keys.
+ * \param values_first1 The beginning of the first input range of values.
+ * \param values_first2 The beginning of the second input range of values.
+ * \param keys_result The beginning of the output range of keys.
+ * \param values_result The beginning of the output range of values.
+ * \param comp Comparison operator.
+ * \return A \p pair \c p such that p.first is the end of the output range of keys,
+ * and such that p.second is the end of the output range of values.
+ *
+ * \tparam InputIterator1 is a model of Input Iterator,
+ * \p InputIterator1 and \p InputIterator2 have the same \c value_type,
+ * \p InputIterator1's \c value_type is a model of LessThan Comparable,
+ * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements,
+ * and \p InputIterator1's \c value_type is convertible to a type in \p OutputIterator1's set of \c value_types.
+ * \tparam InputIterator2 is a model of Input Iterator,
+ * \p InputIterator2 and \p InputIterator1 have the same \c value_type,
+ * \p InputIterator2's \c value_type is a model of LessThan Comparable,
+ * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements,
+ * and \p InputIterator2's \c value_type is convertible to a type in \p OutputIterator1's set of \c value_types.
+ * \tparam InputIterator3 is a model of Input Iterator,
+ * and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ * \tparam InputIterator4 is a model of Input Iterator,
+ * and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ * \tparam OutputIterator1 is a model of Output Iterator.
+ * \tparam OutputIterator2 is a model of Output Iterator.
+ * \tparam StrictWeakCompare is a model of Strict Weak Ordering.
+ *
+ * \pre The ranges [keys_first1, keys_last1) and [keys_first2, keys_last2) shall be sorted with respect to \p comp.
+ * \pre The resulting ranges shall not overlap with any input range.
+ *
+ * The following code snippet demonstrates how to use \p set_symmetric_difference_by_key to compute the
+ * symmetric difference of two sets of integers sorted in descending order with their values.
+ *
+ * \code
+ * #include <thrust/set_operations.h>
+ * #include <thrust/functional.h>
+ * ...
+ * int A_keys[7] = {7, 6, 4, 2, 1, 1, 0};
+ * int A_vals[7] = {0, 0, 0, 0, 0, 0, 0};
+ *
+ * int B_keys[5] = {8, 5, 2, 1, 1};
+ * int B_vals[5] = {1, 1, 1, 1, 1};
+ *
+ * int keys_result[6];
+ * int vals_result[6];
+ *
+ * thrust::pair<int*,int*> end = thrust::set_symmetric_difference_by_key(A_keys, A_keys + 7, B_keys, B_keys + 5, A_vals, B_vals, keys_result, vals_result, thrust::greater<int>());
+ * // keys_result is now {8, 7, 6, 5, 4, 0}
+ * // vals_result is now {1, 0, 0, 1, 0, 0}
+ * \endcode
+ *
+ * \see \p set_union_by_key
+ * \see \p set_intersection_by_key
+ * \see \p set_difference_by_key
+ * \see \p sort_by_key
+ * \see \p is_sorted
+ */
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename StrictWeakCompare>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    set_symmetric_difference_by_key(InputIterator1 keys_first1,
+                                    InputIterator1 keys_last1,
+                                    InputIterator2 keys_first2,
+                                    InputIterator2 keys_last2,
+                                    InputIterator3 values_first1,
+                                    InputIterator4 values_first2,
+                                    OutputIterator1 keys_result,
+                                    OutputIterator2 values_result,
+                                    StrictWeakCompare comp);
+
+
+/*! \p set_union_by_key performs a key-value union operation from set theory.
+ * \p set_union_by_key constructs a sorted range that is the union of the sorted
+ * ranges [keys_first1, keys_last1) and [keys_first2, keys_last2). Associated
+ * with each element from the input and output key ranges is a value element. The associated input
+ * value ranges need not be sorted.
+ *
+ * In the simplest case, \p set_union_by_key performs the "union" operation from set theory:
+ * the output range contains a copy of every element that is contained in
+ * [keys_first1, keys_last1), [keys_first2, keys_last2), or both. The general case
+ * is more complicated, because the input ranges may contain duplicate elements.
+ * The generalization is that if [keys_first1, keys_last1) contains \c m elements
+ * that are equivalent to each other and if [keys_first2, keys_last2) contains \c n
+ * elements that are equivalent to them, then all \c m elements from the first
+ * range shall be copied to the output range, in order, and then max(n - m, 0)
+ * elements from the second range shall be copied to the output, in order.
+ *
+ * Each time a key element from [keys_first1, keys_last1) or
+ * [keys_first2, keys_last2) is copied to the keys output range, the
+ * corresponding value element is copied from the corresponding values input range (beginning at
+ * \p values_first1 or \p values_first2) to the values output range.
+ *
+ * This version of \p set_union_by_key compares key elements using \c operator<.
+ *
+ * The algorithm's execution is parallelized as determined by \p exec.
+ *
+ * \param exec The execution policy to use for parallelization.
+ * \param keys_first1 The beginning of the first input range of keys.
+ * \param keys_last1 The end of the first input range of keys.
+ * \param keys_first2 The beginning of the second input range of keys.
+ * \param keys_last2 The end of the second input range of keys.
+ * \param values_first1 The beginning of the first input range of values.
+ * \param values_first2 The beginning of the second input range of values.
+ * \param keys_result The beginning of the output range of keys.
+ * \param values_result The beginning of the output range of values.
+ * \return A \p pair \c p such that p.first is the end of the output range of keys,
+ * and such that p.second is the end of the output range of values.
+ *
+ * \tparam DerivedPolicy The name of the derived execution policy.
+ * \tparam InputIterator1 is a model of Input Iterator,
+ * \p InputIterator1 and \p InputIterator2 have the same \c value_type,
+ * \p InputIterator1's \c value_type is a model of LessThan Comparable,
+ * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements,
+ * and \p InputIterator1's \c value_type is convertible to a type in \p OutputIterator1's set of \c value_types.
+ * \tparam InputIterator2 is a model of Input Iterator,
+ * \p InputIterator2 and \p InputIterator1 have the same \c value_type,
+ * \p InputIterator2's \c value_type is a model of LessThan Comparable,
+ * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements,
+ * and \p InputIterator2's \c value_type is convertible to a type in \p OutputIterator1's set of \c value_types.
+ * \tparam InputIterator3 is a model of Input Iterator,
+ * and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ * \tparam InputIterator4 is a model of Input Iterator,
+ * and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ * \tparam OutputIterator1 is a model of Output Iterator.
+ * \tparam OutputIterator2 is a model of Output Iterator.
+ *
+ * \pre The ranges [keys_first1, keys_last1) and [keys_first2, keys_last2) shall be sorted with respect to operator<.
+ * \pre The resulting ranges shall not overlap with any input range.
+ *
+ * The following code snippet demonstrates how to use \p set_union_by_key to compute the
+ * union of two sets of integers sorted in ascending order with their values using the
+ * \p thrust::host execution policy for parallelization:
+ *
+ * \code
+ * #include <thrust/set_operations.h>
+ * #include <thrust/execution_policy.h>
+ * ...
+ * int A_keys[7] = {0, 2, 4, 6, 8, 10, 12};
+ * int A_vals[7] = {0, 0, 0, 0, 0, 0, 0};
+ *
+ * int B_keys[5] = {1, 3, 5, 7, 9};
+ * int B_vals[5] = {1, 1, 1, 1, 1};
+ *
+ * int keys_result[12];
+ * int vals_result[12];
+ *
+ * thrust::pair<int*,int*> end = thrust::set_union_by_key(thrust::host, A_keys, A_keys + 7, B_keys, B_keys + 5, A_vals, B_vals, keys_result, vals_result);
+ * // keys_result is now {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12}
+ * // vals_result is now {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0}
+ * \endcode
+ *
+ * \see \p set_symmetric_difference_by_key
+ * \see \p set_intersection_by_key
+ * \see \p set_difference_by_key
+ * \see \p sort_by_key
+ * \see \p is_sorted
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    set_union_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                     InputIterator1 keys_first1,
+                     InputIterator1 keys_last1,
+                     InputIterator2 keys_first2,
+                     InputIterator2 keys_last2,
+                     InputIterator3 values_first1,
+                     InputIterator4 values_first2,
+                     OutputIterator1 keys_result,
+                     OutputIterator2 values_result);
+
+
+/*! \p set_union_by_key performs a key-value union operation from set theory.
+ * \p set_union_by_key constructs a sorted range that is the union of the sorted
+ * ranges [keys_first1, keys_last1) and [keys_first2, keys_last2). Associated
+ * with each element from the input and output key ranges is a value element. The associated input
+ * value ranges need not be sorted.
+ *
+ * In the simplest case, \p set_union_by_key performs the "union" operation from set theory:
+ * the output range contains a copy of every element that is contained in
+ * [keys_first1, keys_last1), [keys_first2, keys_last2), or both. The general case
+ * is more complicated, because the input ranges may contain duplicate elements.
+ * The generalization is that if [keys_first1, keys_last1) contains \c m elements
+ * that are equivalent to each other and if [keys_first2, keys_last2) contains \c n
+ * elements that are equivalent to them, then all \c m elements from the first
+ * range shall be copied to the output range, in order, and then max(n - m, 0)
+ * elements from the second range shall be copied to the output, in order.
+ *
+ * Each time a key element from [keys_first1, keys_last1) or
+ * [keys_first2, keys_last2) is copied to the keys output range, the
+ * corresponding value element is copied from the corresponding values input range (beginning at
+ * \p values_first1 or \p values_first2) to the values output range.
+ *
+ * This version of \p set_union_by_key compares key elements using \c operator<.
+ *
+ * \param keys_first1 The beginning of the first input range of keys.
+ * \param keys_last1 The end of the first input range of keys.
+ * \param keys_first2 The beginning of the second input range of keys.
+ * \param keys_last2 The end of the second input range of keys.
+ * \param values_first1 The beginning of the first input range of values.
+ * \param values_first2 The beginning of the second input range of values.
+ * \param keys_result The beginning of the output range of keys.
+ * \param values_result The beginning of the output range of values.
+ * \return A \p pair \c p such that p.first is the end of the output range of keys,
+ * and such that p.second is the end of the output range of values.
+ *
+ * \tparam InputIterator1 is a model of Input Iterator,
+ * \p InputIterator1 and \p InputIterator2 have the same \c value_type,
+ * \p InputIterator1's \c value_type is a model of LessThan Comparable,
+ * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements,
+ * and \p InputIterator1's \c value_type is convertible to a type in \p OutputIterator1's set of \c value_types.
+ * \tparam InputIterator2 is a model of Input Iterator,
+ * \p InputIterator2 and \p InputIterator1 have the same \c value_type,
+ * \p InputIterator2's \c value_type is a model of LessThan Comparable,
+ * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements,
+ * and \p InputIterator2's \c value_type is convertible to a type in \p OutputIterator1's set of \c value_types.
+ * \tparam InputIterator3 is a model of Input Iterator,
+ * and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ * \tparam InputIterator4 is a model of Input Iterator,
+ * and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ * \tparam OutputIterator1 is a model of Output Iterator.
+ * \tparam OutputIterator2 is a model of Output Iterator.
+ *
+ * \pre The ranges [keys_first1, keys_last1) and [keys_first2, keys_last2) shall be sorted with respect to operator<.
+ * \pre The resulting ranges shall not overlap with any input range.
+ *
+ * The following code snippet demonstrates how to use \p set_union_by_key to compute the
+ * union of two sets of integers sorted in ascending order with their values.
+ *
+ * \code
+ * #include <thrust/set_operations.h>
+ * ...
+ * int A_keys[7] = {0, 2, 4, 6, 8, 10, 12};
+ * int A_vals[7] = {0, 0, 0, 0, 0, 0, 0};
+ *
+ * int B_keys[5] = {1, 3, 5, 7, 9};
+ * int B_vals[5] = {1, 1, 1, 1, 1};
+ *
+ * int keys_result[12];
+ * int vals_result[12];
+ *
+ * thrust::pair<int*,int*> end = thrust::set_union_by_key(A_keys, A_keys + 7, B_keys, B_keys + 5, A_vals, B_vals, keys_result, vals_result);
+ * // keys_result is now {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12}
+ * // vals_result is now {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0}
+ * \endcode
+ *
+ * \see \p set_symmetric_difference_by_key
+ * \see \p set_intersection_by_key
+ * \see \p set_difference_by_key
+ * \see \p sort_by_key
+ * \see \p is_sorted
+ */
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    set_union_by_key(InputIterator1 keys_first1,
+                     InputIterator1 keys_last1,
+                     InputIterator2 keys_first2,
+                     InputIterator2 keys_last2,
+                     InputIterator3 values_first1,
+                     InputIterator4 values_first2,
+                     OutputIterator1 keys_result,
+                     OutputIterator2 values_result);
+
+
+/*! \p set_union_by_key performs a key-value union operation from set theory.
+ * \p set_union_by_key constructs a sorted range that is the union of the sorted
+ * ranges [keys_first1, keys_last1) and [keys_first2, keys_last2). Associated
+ * with each element from the input and output key ranges is a value element. The associated input
+ * value ranges need not be sorted.
+ *
+ * In the simplest case, \p set_union_by_key performs the "union" operation from set theory:
+ * the output range contains a copy of every element that is contained in
+ * [keys_first1, keys_last1), [keys_first2, keys_last2), or both. The general case
+ * is more complicated, because the input ranges may contain duplicate elements.
+ * The generalization is that if [keys_first1, keys_last1) contains \c m elements
+ * that are equivalent to each other and if [keys_first2, keys_last2) contains \c n
+ * elements that are equivalent to them, then all \c m elements from the first
+ * range shall be copied to the output range, in order, and then max(n - m, 0)
+ * elements from the second range shall be copied to the output, in order.
+ *
+ * Each time a key element from [keys_first1, keys_last1) or
+ * [keys_first2, keys_last2) is copied to the keys output range, the
+ * corresponding value element is copied from the corresponding values input range (beginning at
+ * \p values_first1 or \p values_first2) to the values output range.
+ *
+ * This version of \p set_union_by_key compares key elements using a function object \c comp.
+ *
+ * The algorithm's execution is parallelized as determined by \p exec.
+ *
+ * \param exec The execution policy to use for parallelization.
+ * \param keys_first1 The beginning of the first input range of keys.
+ * \param keys_last1 The end of the first input range of keys.
+ * \param keys_first2 The beginning of the second input range of keys.
+ * \param keys_last2 The end of the second input range of keys.
+ * \param values_first1 The beginning of the first input range of values.
+ * \param values_first2 The beginning of the second input range of values.
+ * \param keys_result The beginning of the output range of keys.
+ * \param values_result The beginning of the output range of values.
+ * \param comp Comparison operator.
+ * \return A \p pair \c p such that p.first is the end of the output range of keys,
+ * and such that p.second is the end of the output range of values.
+ *
+ * \tparam DerivedPolicy The name of the derived execution policy.
+ * \tparam InputIterator1 is a model of Input Iterator,
+ * \p InputIterator1 and \p InputIterator2 have the same \c value_type,
+ * \p InputIterator1's \c value_type is a model of LessThan Comparable,
+ * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements,
+ * and \p InputIterator1's \c value_type is convertible to a type in \p OutputIterator1's set of \c value_types.
+ * \tparam InputIterator2 is a model of Input Iterator,
+ * \p InputIterator2 and \p InputIterator1 have the same \c value_type,
+ * \p InputIterator2's \c value_type is a model of LessThan Comparable,
+ * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements,
+ * and \p InputIterator2's \c value_type is convertible to a type in \p OutputIterator1's set of \c value_types.
+ * \tparam InputIterator3 is a model of Input Iterator,
+ * and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ * \tparam InputIterator4 is a model of Input Iterator,
+ * and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ * \tparam OutputIterator1 is a model of Output Iterator.
+ * \tparam OutputIterator2 is a model of Output Iterator.
+ * \tparam StrictWeakCompare is a model of Strict Weak Ordering.
+ *
+ * \pre The ranges [keys_first1, keys_last1) and [keys_first2, keys_last2) shall be sorted with respect to \p comp.
+ * \pre The resulting ranges shall not overlap with any input range.
+ *
+ * The following code snippet demonstrates how to use \p set_union_by_key to compute the
+ * union of two sets of integers sorted in descending order with their values using the
+ * \p thrust::host execution policy for parallelization:
+ *
+ * \code
+ * #include <thrust/set_operations.h>
+ * #include <thrust/functional.h>
+ * #include <thrust/execution_policy.h>
+ * ...
+ * int A_keys[7] = {12, 10, 8, 6, 4, 2, 0};
+ * int A_vals[7] = { 0,  0, 0, 0, 0, 0, 0};
+ *
+ * int B_keys[5] = {9, 7, 5, 3, 1};
+ * int B_vals[5] = {1, 1, 1, 1, 1};
+ *
+ * int keys_result[12];
+ * int vals_result[12];
+ *
+ * thrust::pair<int*,int*> end = thrust::set_union_by_key(thrust::host, A_keys, A_keys + 7, B_keys, B_keys + 5, A_vals, B_vals, keys_result, vals_result, thrust::greater<int>());
+ * // keys_result is now {12, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}
+ * // vals_result is now { 0,  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0}
+ * \endcode
+ *
+ * \see \p set_symmetric_difference_by_key
+ * \see \p set_intersection_by_key
+ * \see \p set_difference_by_key
+ * \see \p sort_by_key
+ * \see \p is_sorted
+ */
+template<typename DerivedPolicy,
+         typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename StrictWeakCompare>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    set_union_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                     InputIterator1 keys_first1,
+                     InputIterator1 keys_last1,
+                     InputIterator2 keys_first2,
+                     InputIterator2 keys_last2,
+                     InputIterator3 values_first1,
+                     InputIterator4 values_first2,
+                     OutputIterator1 keys_result,
+                     OutputIterator2 values_result,
+                     StrictWeakCompare comp);
+
+
+/*! \p set_union_by_key performs a key-value union operation from set theory.
+ * \p set_union_by_key constructs a sorted range that is the union of the sorted
+ * ranges [keys_first1, keys_last1) and [keys_first2, keys_last2). Associated
+ * with each element from the input and output key ranges is a value element. The associated input
+ * value ranges need not be sorted.
+ *
+ * In the simplest case, \p set_union_by_key performs the "union" operation from set theory:
+ * the output range contains a copy of every element that is contained in
+ * [keys_first1, keys_last1), [keys_first2, keys_last2), or both. The general case
+ * is more complicated, because the input ranges may contain duplicate elements.
+ * The generalization is that if [keys_first1, keys_last1) contains \c m elements
+ * that are equivalent to each other and if [keys_first2, keys_last2) contains \c n
+ * elements that are equivalent to them, then all \c m elements from the first
+ * range shall be copied to the output range, in order, and then max(n - m, 0)
+ * elements from the second range shall be copied to the output, in order.
+ *
+ * Each time a key element from [keys_first1, keys_last1) or
+ * [keys_first2, keys_last2) is copied to the keys output range, the
+ * corresponding value element is copied from the corresponding values input range (beginning at
+ * \p values_first1 or \p values_first2) to the values output range.
+ *
+ * This version of \p set_union_by_key compares key elements using a function object \c comp.
+ *
+ * \param keys_first1 The beginning of the first input range of keys.
+ * \param keys_last1 The end of the first input range of keys.
+ * \param keys_first2 The beginning of the second input range of keys.
+ * \param keys_last2 The end of the second input range of keys.
+ * \param values_first1 The beginning of the first input range of values.
+ * \param values_first2 The beginning of the second input range of values.
+ * \param keys_result The beginning of the output range of keys.
+ * \param values_result The beginning of the output range of values.
+ * \param comp Comparison operator.
+ * \return A \p pair \c p such that p.first is the end of the output range of keys,
+ * and such that p.second is the end of the output range of values.
+ *
+ * \tparam InputIterator1 is a model of Input Iterator,
+ * \p InputIterator1 and \p InputIterator2 have the same \c value_type,
+ * \p InputIterator1's \c value_type is a model of LessThan Comparable,
+ * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements,
+ * and \p InputIterator1's \c value_type is convertible to a type in \p OutputIterator1's set of \c value_types.
+ * \tparam InputIterator2 is a model of Input Iterator,
+ * \p InputIterator2 and \p InputIterator1 have the same \c value_type,
+ * \p InputIterator2's \c value_type is a model of LessThan Comparable,
+ * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements,
+ * and \p InputIterator2's \c value_type is convertible to a type in \p OutputIterator1's set of \c value_types.
+ * \tparam InputIterator3 is a model of Input Iterator,
+ * and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ * \tparam InputIterator4 is a model of Input Iterator,
+ * and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
+ * \tparam OutputIterator1 is a model of Output Iterator.
+ * \tparam OutputIterator2 is a model of Output Iterator.
+ * \tparam StrictWeakCompare is a model of Strict Weak Ordering.
+ *
+ * \pre The ranges [keys_first1, keys_last1) and [keys_first2, keys_last2) shall be sorted with respect to \p comp.
+ * \pre The resulting ranges shall not overlap with any input range.
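+ *
+ * The handling of duplicate keys can be seen in the following minimal sketch (hypothetical data,
+ * used only to illustrate the max(n - m, 0) rule; it is not part of the larger example below):
+ *
+ * \code
+ * #include <thrust/set_operations.h>
+ * #include <thrust/functional.h>
+ * ...
+ * int A_keys[3] = {3, 1, 1};   // two equivalent keys 1 (m == 2)
+ * int A_vals[3] = {0, 0, 0};
+ * int B_keys[2] = {2, 1};      // one equivalent key 1 (n == 1)
+ * int B_vals[2] = {1, 1};
+ * int keys_result[4];
+ * int vals_result[4];
+ * thrust::set_union_by_key(A_keys, A_keys + 3, B_keys, B_keys + 2, A_vals, B_vals, keys_result, vals_result, thrust::greater<int>());
+ * // keys_result is now {3, 2, 1, 1}   -- both copies of 1 come from the first range, max(n - m, 0) == 0 from the second
+ * // vals_result is now {0, 1, 0, 0}
+ * \endcode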
+ *
+ * The following code snippet demonstrates how to use \p set_union_by_key to compute the
+ * union of two sets of integers sorted in descending order with their values.
+ *
+ * \code
+ * #include <thrust/set_operations.h>
+ * #include <thrust/functional.h>
+ * ...
+ * int A_keys[7] = {12, 10, 8, 6, 4, 2, 0};
+ * int A_vals[7] = { 0,  0, 0, 0, 0, 0, 0};
+ *
+ * int B_keys[5] = {9, 7, 5, 3, 1};
+ * int B_vals[5] = {1, 1, 1, 1, 1};
+ *
+ * int keys_result[12];
+ * int vals_result[12];
+ *
+ * thrust::pair<int*,int*> end = thrust::set_union_by_key(A_keys, A_keys + 7, B_keys, B_keys + 5, A_vals, B_vals, keys_result, vals_result, thrust::greater<int>());
+ * // keys_result is now {12, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}
+ * // vals_result is now { 0,  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0}
+ * \endcode
+ *
+ * \see \p set_symmetric_difference_by_key
+ * \see \p set_intersection_by_key
+ * \see \p set_difference_by_key
+ * \see \p sort_by_key
+ * \see \p is_sorted
+ */
+template<typename InputIterator1,
+         typename InputIterator2,
+         typename InputIterator3,
+         typename InputIterator4,
+         typename OutputIterator1,
+         typename OutputIterator2,
+         typename StrictWeakCompare>
+  thrust::pair<OutputIterator1,OutputIterator2>
+    set_union_by_key(InputIterator1 keys_first1,
+                     InputIterator1 keys_last1,
+                     InputIterator2 keys_first2,
+                     InputIterator2 keys_last2,
+                     InputIterator3 values_first1,
+                     InputIterator4 values_first2,
+                     OutputIterator1 keys_result,
+                     OutputIterator2 values_result,
+                     StrictWeakCompare comp);
+
+
+/*! \} // end set_operations
+ */
+
+
+} // end thrust
+
+#include <thrust/detail/set_operations.inl>
+
diff --git a/compat/thrust/sort.h b/compat/thrust/sort.h
new file mode 100644
index 0000000..e8edfcd
--- /dev/null
+++ b/compat/thrust/sort.h
@@ -0,0 +1,1349 @@
+/*
+ * Copyright 2008-2012 NVIDIA Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/*! \file sort.h
+ * \brief Functions for reorganizing ranges into sorted order
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+
+namespace thrust
+{
+
+
+/*! \addtogroup sorting
+ * \ingroup algorithms
+ * \{
+ */
+
+
+/*! \p sort sorts the elements in [first, last) into
+ * ascending order, meaning that if \c i and \c j are any two valid
+ * iterators in [first, last) such that \c i precedes \c j,
+ * then \c *j is not less than \c *i. Note: \c sort is not guaranteed
+ * to be stable. That is, suppose that \c *i and \c *j are equivalent:
+ * neither one is less than the other. It is not guaranteed that the
+ * relative order of these two elements will be preserved by \p sort.
+ *
+ * This version of \p sort compares objects using \c operator<.
+ *
+ * The algorithm's execution is parallelized as determined by \p exec.
+ *
+ * \param exec The execution policy to use for parallelization.
+ * \param first The beginning of the sequence.
+ * \param last The end of the sequence.
+ *
+ * \tparam DerivedPolicy The name of the derived execution policy.
+ * \tparam RandomAccessIterator is a model of Random Access Iterator,
+ * \p RandomAccessIterator is mutable,
+ * and \p RandomAccessIterator's \c value_type is a model of LessThan Comparable,
+ * and the ordering relation on \p RandomAccessIterator's \c value_type is a strict weak ordering, as defined in the
+ * LessThan Comparable requirements.
+ * + * The following code snippet demonstrates how to use \p sort to sort + * a sequence of integers using the \p thrust::host execution policy for parallelization: + * + * \code + * #include + * #include + * ... + * const int N = 6; + * int A[N] = {1, 4, 2, 8, 5, 7}; + * thrust::sort(thrust::host, A, A + N); + * // A is now {1, 2, 4, 5, 7, 8} + * \endcode + * + * \see http://www.sgi.com/tech/stl/sort.html + * \see \p stable_sort + * \see \p sort_by_key + */ +template + void sort(const thrust::detail::execution_policy_base &exec, + RandomAccessIterator first, + RandomAccessIterator last); + + +/*! \p sort sorts the elements in [first, last) into + * ascending order, meaning that if \c i and \c j are any two valid + * iterators in [first, last) such that \c i precedes \c j, + * then \c *j is not less than \c *i. Note: \c sort is not guaranteed + * to be stable. That is, suppose that \c *i and \c *j are equivalent: + * neither one is less than the other. It is not guaranteed that the + * relative order of these two elements will be preserved by \p sort. + * + * This version of \p sort compares objects using \c operator<. + * + * \param first The beginning of the sequence. + * \param last The end of the sequence. + * + * \tparam RandomAccessIterator is a model of Random Access Iterator, + * \p RandomAccessIterator is mutable, + * and \p RandomAccessIterator's \c value_type is a model of LessThan Comparable, + * and the ordering relation on \p RandomAccessIterator's \c value_type is a strict weak ordering, as defined in the + * LessThan Comparable requirements. + * + * The following code snippet demonstrates how to use \p sort to sort + * a sequence of integers. + * + * \code + * #include + * ... + * const int N = 6; + * int A[N] = {1, 4, 2, 8, 5, 7}; + * thrust::sort(A, A + N); + * // A is now {1, 2, 4, 5, 7, 8} + * \endcode + * + * \see http://www.sgi.com/tech/stl/sort.html + * \see \p stable_sort + * \see \p sort_by_key + */ +template + void sort(RandomAccessIterator first, + RandomAccessIterator last); + + +/*! \p sort sorts the elements in [first, last) into + * ascending order, meaning that if \c i and \c j are any two valid + * iterators in [first, last) such that \c i precedes \c j, + * then \c *j is not less than \c *i. Note: \c sort is not guaranteed + * to be stable. That is, suppose that \c *i and \c *j are equivalent: + * neither one is less than the other. It is not guaranteed that the + * relative order of these two elements will be preserved by \p sort. + * + * This version of \p sort compares objects using a function object + * \p comp. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the sequence. + * \param last The end of the sequence. + * \param comp Comparison operator. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam RandomAccessIterator is a model of Random Access Iterator, + * \p RandomAccessIterator is mutable, + * and \p RandomAccessIterator's \c value_type is convertible to \p StrictWeakOrdering's + * \c first_argument_type and \c second_argument_type. + * \tparam StrictWeakOrdering is a model of Strict Weak Ordering. + * + * The following code demonstrates how to sort integers in descending order + * using the greater comparison operator using the \p thrust::host execution policy for parallelization: + * + * \code + * #include + * #include + * #include + * ... 
+ * const int N = 6; + * int A[N] = {1, 4, 2, 8, 5, 7}; + * thrust::sort(thrust::host, A, A + N, thrust::greater()); + * // A is now {8, 7, 5, 4, 2, 1}; + * \endcode + * + * \see http://www.sgi.com/tech/stl/sort.html + * \see \p stable_sort + * \see \p sort_by_key + */ +template + void sort(const thrust::detail::execution_policy_base &exec, + RandomAccessIterator first, + RandomAccessIterator last, + StrictWeakOrdering comp); + + +/*! \p sort sorts the elements in [first, last) into + * ascending order, meaning that if \c i and \c j are any two valid + * iterators in [first, last) such that \c i precedes \c j, + * then \c *j is not less than \c *i. Note: \c sort is not guaranteed + * to be stable. That is, suppose that \c *i and \c *j are equivalent: + * neither one is less than the other. It is not guaranteed that the + * relative order of these two elements will be preserved by \p sort. + * + * This version of \p sort compares objects using a function object + * \p comp. + * + * \param first The beginning of the sequence. + * \param last The end of the sequence. + * \param comp Comparison operator. + * + * \tparam RandomAccessIterator is a model of Random Access Iterator, + * \p RandomAccessIterator is mutable, + * and \p RandomAccessIterator's \c value_type is convertible to \p StrictWeakOrdering's + * \c first_argument_type and \c second_argument_type. + * \tparam StrictWeakOrdering is a model of Strict Weak Ordering. + * + * The following code demonstrates how to sort integers in descending order + * using the greater comparison operator. + * + * \code + * #include + * #include + * ... + * const int N = 6; + * int A[N] = {1, 4, 2, 8, 5, 7}; + * thrust::sort(A, A + N, thrust::greater()); + * // A is now {8, 7, 5, 4, 2, 1}; + * \endcode + * + * \see http://www.sgi.com/tech/stl/sort.html + * \see \p stable_sort + * \see \p sort_by_key + */ +template + void sort(RandomAccessIterator first, + RandomAccessIterator last, + StrictWeakOrdering comp); + + +/*! \p stable_sort is much like \c sort: it sorts the elements in + * [first, last) into ascending order, meaning that if \c i + * and \c j are any two valid iterators in [first, last) such + * that \c i precedes \c j, then \c *j is not less than \c *i. + * + * As the name suggests, \p stable_sort is stable: it preserves the + * relative ordering of equivalent elements. That is, if \c x and \c y + * are elements in [first, last) such that \c x precedes \c y, + * and if the two elements are equivalent (neither x < y nor + * y < x) then a postcondition of \p stable_sort is that \c x + * still precedes \c y. + * + * This version of \p stable_sort compares objects using \c operator<. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the sequence. + * \param last The end of the sequence. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam RandomAccessIterator is a model of Random Access Iterator, + * \p RandomAccessIterator is mutable, + * and \p RandomAccessIterator's \c value_type is a model of LessThan Comparable, + * and the ordering relation on \p RandomAccessIterator's \c value_type is a strict weak ordering, as defined in the + * LessThan Comparable requirements. + * + * The following code snippet demonstrates how to use \p sort to sort + * a sequence of integers using the \p thrust::host execution policy for parallelization: + * + * \code + * #include + * #include + * ... 
+ * const int N = 6; + * int A[N] = {1, 4, 2, 8, 5, 7}; + * thrust::stable_sort(thrust::host, A, A + N); + * // A is now {1, 2, 4, 5, 7, 8} + * \endcode + * + * \see http://www.sgi.com/tech/stl/stable_sort.html + * \see \p sort + * \see \p stable_sort_by_key + */ +template + void stable_sort(const thrust::detail::execution_policy_base &exec, + RandomAccessIterator first, + RandomAccessIterator last); + + +/*! \p stable_sort is much like \c sort: it sorts the elements in + * [first, last) into ascending order, meaning that if \c i + * and \c j are any two valid iterators in [first, last) such + * that \c i precedes \c j, then \c *j is not less than \c *i. + * + * As the name suggests, \p stable_sort is stable: it preserves the + * relative ordering of equivalent elements. That is, if \c x and \c y + * are elements in [first, last) such that \c x precedes \c y, + * and if the two elements are equivalent (neither x < y nor + * y < x) then a postcondition of \p stable_sort is that \c x + * still precedes \c y. + * + * This version of \p stable_sort compares objects using \c operator<. + * + * \param first The beginning of the sequence. + * \param last The end of the sequence. + * + * \tparam RandomAccessIterator is a model of Random Access Iterator, + * \p RandomAccessIterator is mutable, + * and \p RandomAccessIterator's \c value_type is a model of LessThan Comparable, + * and the ordering relation on \p RandomAccessIterator's \c value_type is a strict weak ordering, as defined in the + * LessThan Comparable requirements. + * + * The following code snippet demonstrates how to use \p sort to sort + * a sequence of integers. + * + * \code + * #include + * ... + * const int N = 6; + * int A[N] = {1, 4, 2, 8, 5, 7}; + * thrust::stable_sort(A, A + N); + * // A is now {1, 2, 4, 5, 7, 8} + * \endcode + * + * \see http://www.sgi.com/tech/stl/stable_sort.html + * \see \p sort + * \see \p stable_sort_by_key + */ +template + void stable_sort(RandomAccessIterator first, + RandomAccessIterator last); + + +/*! \p stable_sort is much like \c sort: it sorts the elements in + * [first, last) into ascending order, meaning that if \c i + * and \c j are any two valid iterators in [first, last) such + * that \c i precedes \c j, then \c *j is not less than \c *i. + * + * As the name suggests, \p stable_sort is stable: it preserves the + * relative ordering of equivalent elements. That is, if \c x and \c y + * are elements in [first, last) such that \c x precedes \c y, + * and if the two elements are equivalent (neither x < y nor + * y < x) then a postcondition of \p stable_sort is that \c x + * still precedes \c y. + * + * This version of \p stable_sort compares objects using a function object + * \p comp. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the sequence. + * \param last The end of the sequence. + * \param comp Comparison operator. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam RandomAccessIterator is a model of Random Access Iterator, + * \p RandomAccessIterator is mutable, + * and \p RandomAccessIterator's \c value_type is convertible to \p StrictWeakOrdering's + * \c first_argument_type and \c second_argument_type. + * \tparam StrictWeakOrdering is a model of Strict Weak Ordering. 
+ *
+ * The following code demonstrates how to sort integers in descending order
+ * using the greater comparison operator using the \p thrust::host execution policy for parallelization:
+ *
+ * \code
+ * #include <thrust/sort.h>
+ * #include <thrust/functional.h>
+ * #include <thrust/execution_policy.h>
+ * ...
+ * const int N = 6;
+ * int A[N] = {1, 4, 2, 8, 5, 7};
+ * thrust::stable_sort(thrust::host, A, A + N, thrust::greater<int>());
+ * // A is now {8, 7, 5, 4, 2, 1};
+ * \endcode
+ *
+ * \see http://www.sgi.com/tech/stl/stable_sort.html
+ * \see \p sort
+ * \see \p stable_sort_by_key
+ */
+template<typename DerivedPolicy,
+         typename RandomAccessIterator,
+         typename StrictWeakOrdering>
+  void stable_sort(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                   RandomAccessIterator first,
+                   RandomAccessIterator last,
+                   StrictWeakOrdering comp);
+
+
+/*! \p stable_sort is much like \c sort: it sorts the elements in
+ * [first, last) into ascending order, meaning that if \c i
+ * and \c j are any two valid iterators in [first, last) such
+ * that \c i precedes \c j, then \c *j is not less than \c *i.
+ *
+ * As the name suggests, \p stable_sort is stable: it preserves the
+ * relative ordering of equivalent elements. That is, if \c x and \c y
+ * are elements in [first, last) such that \c x precedes \c y,
+ * and if the two elements are equivalent (neither x < y nor
+ * y < x) then a postcondition of \p stable_sort is that \c x
+ * still precedes \c y.
+ *
+ * This version of \p stable_sort compares objects using a function object
+ * \p comp.
+ *
+ * \param first The beginning of the sequence.
+ * \param last The end of the sequence.
+ * \param comp Comparison operator.
+ *
+ * \tparam RandomAccessIterator is a model of Random Access Iterator,
+ * \p RandomAccessIterator is mutable,
+ * and \p RandomAccessIterator's \c value_type is convertible to \p StrictWeakOrdering's
+ * \c first_argument_type and \c second_argument_type.
+ * \tparam StrictWeakOrdering is a model of Strict Weak Ordering.
+ *
+ * The following code demonstrates how to sort integers in descending order
+ * using the greater comparison operator.
+ *
+ * \code
+ * #include <thrust/sort.h>
+ * #include <thrust/functional.h>
+ * ...
+ * const int N = 6;
+ * int A[N] = {1, 4, 2, 8, 5, 7};
+ * thrust::stable_sort(A, A + N, thrust::greater<int>());
+ * // A is now {8, 7, 5, 4, 2, 1};
+ * \endcode
+ *
+ * \see http://www.sgi.com/tech/stl/stable_sort.html
+ * \see \p sort
+ * \see \p stable_sort_by_key
+ */
+template<typename RandomAccessIterator,
+         typename StrictWeakOrdering>
+  void stable_sort(RandomAccessIterator first,
+                   RandomAccessIterator last,
+                   StrictWeakOrdering comp);
+
+
+///////////////
+// Key Value //
+///////////////
+
+
+/*! \p sort_by_key performs a key-value sort. That is, \p sort_by_key sorts the
+ * elements in [keys_first, keys_last) and [values_first,
+ * values_first + (keys_last - keys_first)) into ascending key order,
+ * meaning that if \c i and \c j are any two valid iterators in [keys_first,
+ * keys_last) such that \c i precedes \c j, and \c p and \c q are iterators
+ * in [values_first, values_first + (keys_last - keys_first))
+ * corresponding to \c i and \c j respectively, then \c *j is not less than
+ * \c *i.
+ *
+ * Note: \c sort_by_key is not guaranteed to be stable. That is, suppose that
+ * \c *i and \c *j are equivalent: neither one is less than the other. It is not
+ * guaranteed that the relative order of these two keys or the relative
+ * order of their corresponding values will be preserved by \p sort_by_key.
+ *
+ * This version of \p sort_by_key compares key objects using \c operator<.
+ *
+ * The algorithm's execution is parallelized as determined by \p exec.
+ *
+ * \param exec The execution policy to use for parallelization.
+ * \param keys_first The beginning of the key sequence. + * \param keys_last The end of the key sequence. + * \param values_first The beginning of the value sequence. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam RandomAccessIterator1 is a model of Random Access Iterator, + * \p RandomAccessIterator1 is mutable, + * and \p RandomAccessIterator1's \c value_type is a model of LessThan Comparable, + * and the ordering relation on \p RandomAccessIterator1's \c value_type is a strict weak ordering, as defined in the + * LessThan Comparable requirements. + * \tparam RandomAccessIterator2 is a model of Random Access Iterator, + * and \p RandomAccessIterator2 is mutable. + * + * \pre The range [keys_first, keys_last)) shall not overlap the range [values_first, values_first + (keys_last - keys_first)). + * + * The following code snippet demonstrates how to use \p sort_by_key to sort + * an array of character values using integers as sorting keys using the \p thrust::host execution policy + * for parallelization: + * + * \code + * #include + * #include + * ... + * const int N = 6; + * int keys[N] = { 1, 4, 2, 8, 5, 7}; + * char values[N] = {'a', 'b', 'c', 'd', 'e', 'f'}; + * thrust::sort_by_key(thrust::host, keys, keys + N, values); + * // keys is now { 1, 2, 4, 5, 7, 8} + * // values is now {'a', 'c', 'b', 'e', 'f', 'd'} + * \endcode + * + * \see http://www.sgi.com/tech/stl/sort.html + * \see \p stable_sort_by_key + * \see \p sort + */ +template + void sort_by_key(const thrust::detail::execution_policy_base &exec, + RandomAccessIterator1 keys_first, + RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first); + + +/*! \p sort_by_key performs a key-value sort. That is, \p sort_by_key sorts the + * elements in [keys_first, keys_last) and [values_first, + * values_first + (keys_last - keys_first)) into ascending key order, + * meaning that if \c i and \c j are any two valid iterators in [keys_first, + * keys_last) such that \c i precedes \c j, and \c p and \c q are iterators + * in [values_first, values_first + (keys_last - keys_first)) + * corresponding to \c i and \c j respectively, then \c *j is not less than + * \c *i. + * + * Note: \c sort_by_key is not guaranteed to be stable. That is, suppose that + * \c *i and \c *j are equivalent: neither one is less than the other. It is not + * guaranteed that the relative order of these two keys or the relative + * order of their corresponding values will be preserved by \p sort_by_key. + * + * This version of \p sort_by_key compares key objects using \c operator<. + * + * \param keys_first The beginning of the key sequence. + * \param keys_last The end of the key sequence. + * \param values_first The beginning of the value sequence. + * + * \tparam RandomAccessIterator1 is a model of Random Access Iterator, + * \p RandomAccessIterator1 is mutable, + * and \p RandomAccessIterator1's \c value_type is a model of LessThan Comparable, + * and the ordering relation on \p RandomAccessIterator1's \c value_type is a strict weak ordering, as defined in the + * LessThan Comparable requirements. + * \tparam RandomAccessIterator2 is a model of Random Access Iterator, + * and \p RandomAccessIterator2 is mutable. + * + * \pre The range [keys_first, keys_last)) shall not overlap the range [values_first, values_first + (keys_last - keys_first)). + * + * The following code snippet demonstrates how to use \p sort_by_key to sort + * an array of character values using integers as sorting keys. 
+ * + * \code + * #include + * ... + * const int N = 6; + * int keys[N] = { 1, 4, 2, 8, 5, 7}; + * char values[N] = {'a', 'b', 'c', 'd', 'e', 'f'}; + * thrust::sort_by_key(keys, keys + N, values); + * // keys is now { 1, 2, 4, 5, 7, 8} + * // values is now {'a', 'c', 'b', 'e', 'f', 'd'} + * \endcode + * + * \see http://www.sgi.com/tech/stl/sort.html + * \see \p stable_sort_by_key + * \see \p sort + */ +template + void sort_by_key(RandomAccessIterator1 keys_first, + RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first); + + +/*! \p sort_by_key performs a key-value sort. That is, \p sort_by_key sorts the + * elements in [keys_first, keys_last) and [values_first, + * values_first + (keys_last - keys_first)) into ascending key order, + * meaning that if \c i and \c j are any two valid iterators in [keys_first, + * keys_last) such that \c i precedes \c j, and \c p and \c q are iterators + * in [values_first, values_first + (keys_last - keys_first)) + * corresponding to \c i and \c j respectively, then \c *j is not less than + * \c *i. + * + * Note: \c sort_by_key is not guaranteed to be stable. That is, suppose that + * \c *i and \c *j are equivalent: neither one is less than the other. It is not + * guaranteed that the relative order of these two keys or the relative + * order of their corresponding values will be preserved by \p sort_by_key. + * + * This version of \p sort_by_key compares key objects using a function object + * \c comp. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param keys_first The beginning of the key sequence. + * \param keys_last The end of the key sequence. + * \param values_first The beginning of the value sequence. + * \param comp Comparison operator. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam RandomAccessIterator1 is a model of Random Access Iterator, + * \p RandomAccessIterator1 is mutable, + * and \p RandomAccessIterator1's \c value_type is convertible to \p StrictWeakOrdering's + * \c first_argument_type and \c second_argument_type. + * \tparam RandomAccessIterator2 is a model of Random Access Iterator, + * and \p RandomAccessIterator2 is mutable. + * \tparam StrictWeakOrdering is a model of Strict Weak Ordering. + * + * \pre The range [keys_first, keys_last)) shall not overlap the range [values_first, values_first + (keys_last - keys_first)). + * + * The following code snippet demonstrates how to use \p sort_by_key to sort + * an array of character values using integers as sorting keys using the \p thrust::host execution policy + * for parallelization.The keys are sorted in descending order using the greater comparison operator. + * + * \code + * #include + * #include + * ... + * const int N = 6; + * int keys[N] = { 1, 4, 2, 8, 5, 7}; + * char values[N] = {'a', 'b', 'c', 'd', 'e', 'f'}; + * thrust::sort_by_key(thrust::host, keys, keys + N, values, thrust::greater()); + * // keys is now { 8, 7, 5, 4, 2, 1} + * // values is now {'d', 'f', 'e', 'b', 'c', 'a'} + * \endcode + * + * \see http://www.sgi.com/tech/stl/sort.html + * \see \p stable_sort_by_key + * \see \p sort + */ +template + void sort_by_key(const thrust::detail::execution_policy_base &exec, + RandomAccessIterator1 keys_first, + RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first, + StrictWeakOrdering comp); + + +/*! \p sort_by_key performs a key-value sort. 
That is, \p sort_by_key sorts the + * elements in [keys_first, keys_last) and [values_first, + * values_first + (keys_last - keys_first)) into ascending key order, + * meaning that if \c i and \c j are any two valid iterators in [keys_first, + * keys_last) such that \c i precedes \c j, and \c p and \c q are iterators + * in [values_first, values_first + (keys_last - keys_first)) + * corresponding to \c i and \c j respectively, then \c *j is not less than + * \c *i. + * + * Note: \c sort_by_key is not guaranteed to be stable. That is, suppose that + * \c *i and \c *j are equivalent: neither one is less than the other. It is not + * guaranteed that the relative order of these two keys or the relative + * order of their corresponding values will be preserved by \p sort_by_key. + * + * This version of \p sort_by_key compares key objects using a function object + * \c comp. + * + * \param keys_first The beginning of the key sequence. + * \param keys_last The end of the key sequence. + * \param values_first The beginning of the value sequence. + * \param comp Comparison operator. + * + * \tparam RandomAccessIterator1 is a model of Random Access Iterator, + * \p RandomAccessIterator1 is mutable, + * and \p RandomAccessIterator1's \c value_type is convertible to \p StrictWeakOrdering's + * \c first_argument_type and \c second_argument_type. + * \tparam RandomAccessIterator2 is a model of Random Access Iterator, + * and \p RandomAccessIterator2 is mutable. + * \tparam StrictWeakOrdering is a model of Strict Weak Ordering. + * + * \pre The range [keys_first, keys_last)) shall not overlap the range [values_first, values_first + (keys_last - keys_first)). + * + * The following code snippet demonstrates how to use \p sort_by_key to sort + * an array of character values using integers as sorting keys. The keys + * are sorted in descending order using the greater comparison operator. + * + * \code + * #include + * ... + * const int N = 6; + * int keys[N] = { 1, 4, 2, 8, 5, 7}; + * char values[N] = {'a', 'b', 'c', 'd', 'e', 'f'}; + * thrust::sort_by_key(keys, keys + N, values, thrust::greater()); + * // keys is now { 8, 7, 5, 4, 2, 1} + * // values is now {'d', 'f', 'e', 'b', 'c', 'a'} + * \endcode + * + * \see http://www.sgi.com/tech/stl/sort.html + * \see \p stable_sort_by_key + * \see \p sort + */ +template + void sort_by_key(RandomAccessIterator1 keys_first, + RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first, + StrictWeakOrdering comp); + + +/*! \p stable_sort_by_key performs a key-value sort. That is, \p stable_sort_by_key + * sorts the elements in [keys_first, keys_last) and [values_first, + * values_first + (keys_last - keys_first)) into ascending key order, + * meaning that if \c i and \c j are any two valid iterators in [keys_first, + * keys_last) such that \c i precedes \c j, and \c p and \c q are iterators + * in [values_first, values_first + (keys_last - keys_first)) + * corresponding to \c i and \c j respectively, then \c *j is not less than + * \c *i. + * + * As the name suggests, \p stable_sort_by_key is stable: it preserves the + * relative ordering of equivalent elements. That is, if \c x and \c y + * are elements in [keys_first, keys_last) such that \c x precedes \c y, + * and if the two elements are equivalent (neither x < y nor + * y < x) then a postcondition of \p stable_sort_by_key is that \c x + * still precedes \c y. + * + * This version of \p stable_sort_by_key compares key objects using \c operator<. 
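+ *
+ * The stability guarantee can be seen in the following minimal sketch (hypothetical data, chosen
+ * only to show duplicate keys; this overload's full example appears further below): values attached
+ * to equal keys come out in their original relative order.
+ *
+ * \code
+ * #include <thrust/sort.h>
+ * #include <thrust/execution_policy.h>
+ * ...
+ * int  keys[4] = { 2,   1,   2,   1 };
+ * char vals[4] = {'a', 'b', 'c', 'd'};
+ * thrust::stable_sort_by_key(thrust::host, keys, keys + 4, vals);
+ * // keys is now { 1,   1,   2,   2 }
+ * // vals is now {'b', 'd', 'a', 'c'}
+ * \endcode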
+ * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param keys_first The beginning of the key sequence. + * \param keys_last The end of the key sequence. + * \param values_first The beginning of the value sequence. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam RandomAccessIterator1 is a model of Random Access Iterator, + * \p RandomAccessIterator1 is mutable, + * and \p RandomAccessIterator1's \c value_type is a model of LessThan Comparable, + * and the ordering relation on \p RandomAccessIterator1's \c value_type is a strict weak ordering, as defined in the + * LessThan Comparable requirements. + * \tparam RandomAccessIterator2 is a model of Random Access Iterator, + * and \p RandomAccessIterator2 is mutable. + * + * \pre The range [keys_first, keys_last)) shall not overlap the range [values_first, values_first + (keys_last - keys_first)). + * + * The following code snippet demonstrates how to use \p stable_sort_by_key to sort + * an array of characters using integers as sorting keys using the \p thrust::host execution policy for + * parallelization: + * + * \code + * #include + * #include + * ... + * const int N = 6; + * int keys[N] = { 1, 4, 2, 8, 5, 7}; + * char values[N] = {'a', 'b', 'c', 'd', 'e', 'f'}; + * thrust::stable_sort_by_key(thrust::host, keys, keys + N, values); + * // keys is now { 1, 2, 4, 5, 7, 8} + * // values is now {'a', 'c', 'b', 'e', 'f', 'd'} + * \endcode + * + * \see http://www.sgi.com/tech/stl/sort.html + * \see \p sort_by_key + * \see \p stable_sort + */ +template + void stable_sort_by_key(const thrust::detail::execution_policy_base &exec, + RandomAccessIterator1 keys_first, + RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first); + + +/*! \p stable_sort_by_key performs a key-value sort. That is, \p stable_sort_by_key + * sorts the elements in [keys_first, keys_last) and [values_first, + * values_first + (keys_last - keys_first)) into ascending key order, + * meaning that if \c i and \c j are any two valid iterators in [keys_first, + * keys_last) such that \c i precedes \c j, and \c p and \c q are iterators + * in [values_first, values_first + (keys_last - keys_first)) + * corresponding to \c i and \c j respectively, then \c *j is not less than + * \c *i. + * + * As the name suggests, \p stable_sort_by_key is stable: it preserves the + * relative ordering of equivalent elements. That is, if \c x and \c y + * are elements in [keys_first, keys_last) such that \c x precedes \c y, + * and if the two elements are equivalent (neither x < y nor + * y < x) then a postcondition of \p stable_sort_by_key is that \c x + * still precedes \c y. + * + * This version of \p stable_sort_by_key compares key objects using \c operator<. + * + * \param keys_first The beginning of the key sequence. + * \param keys_last The end of the key sequence. + * \param values_first The beginning of the value sequence. + * + * \tparam RandomAccessIterator1 is a model of Random Access Iterator, + * \p RandomAccessIterator1 is mutable, + * and \p RandomAccessIterator1's \c value_type is a model of LessThan Comparable, + * and the ordering relation on \p RandomAccessIterator1's \c value_type is a strict weak ordering, as defined in the + * LessThan Comparable requirements. + * \tparam RandomAccessIterator2 is a model of Random Access Iterator, + * and \p RandomAccessIterator2 is mutable. 
+ * + * \pre The range [keys_first, keys_last)) shall not overlap the range [values_first, values_first + (keys_last - keys_first)). + * + * The following code snippet demonstrates how to use \p stable_sort_by_key to sort + * an array of characters using integers as sorting keys. + * + * \code + * #include + * ... + * const int N = 6; + * int keys[N] = { 1, 4, 2, 8, 5, 7}; + * char values[N] = {'a', 'b', 'c', 'd', 'e', 'f'}; + * thrust::stable_sort_by_key(keys, keys + N, values); + * // keys is now { 1, 2, 4, 5, 7, 8} + * // values is now {'a', 'c', 'b', 'e', 'f', 'd'} + * \endcode + * + * \see http://www.sgi.com/tech/stl/sort.html + * \see \p sort_by_key + * \see \p stable_sort + */ +template + void stable_sort_by_key(RandomAccessIterator1 keys_first, + RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first); + + +/*! \p stable_sort_by_key performs a key-value sort. That is, \p stable_sort_by_key + * sorts the elements in [keys_first, keys_last) and [values_first, + * values_first + (keys_last - keys_first)) into ascending key order, + * meaning that if \c i and \c j are any two valid iterators in [keys_first, + * keys_last) such that \c i precedes \c j, and \c p and \c q are iterators + * in [values_first, values_first + (keys_last - keys_first)) + * corresponding to \c i and \c j respectively, then \c *j is not less than + * \c *i. + * + * As the name suggests, \p stable_sort_by_key is stable: it preserves the + * relative ordering of equivalent elements. That is, if \c x and \c y + * are elements in [keys_first, keys_last) such that \c x precedes \c y, + * and if the two elements are equivalent (neither x < y nor + * y < x) then a postcondition of \p stable_sort_by_key is that \c x + * still precedes \c y. + * + * This version of \p stable_sort_by_key compares key objects using the function + * object \p comp. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param keys_first The beginning of the key sequence. + * \param keys_last The end of the key sequence. + * \param values_first The beginning of the value sequence. + * \param comp Comparison operator. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam RandomAccessIterator1 is a model of Random Access Iterator, + * \p RandomAccessIterator1 is mutable, + * and \p RandomAccessIterator1's \c value_type is convertible to \p StrictWeakOrdering's + * \c first_argument_type and \c second_argument_type. + * \tparam RandomAccessIterator2 is a model of Random Access Iterator, + * and \p RandomAccessIterator2 is mutable. + * \tparam StrictWeakOrdering is a model of Strict Weak Ordering. + * + * \pre The range [keys_first, keys_last)) shall not overlap the range [values_first, values_first + (keys_last - keys_first)). + * + * The following code snippet demonstrates how to use \p sort_by_key to sort + * an array of character values using integers as sorting keys using the \p thrust::host execution policy for + * parallelization. The keys are sorted in descending order using the greater comparison operator. + * + * \code + * #include + * #include + * ... 
+ * const int N = 6; + * int keys[N] = { 1, 4, 2, 8, 5, 7}; + * char values[N] = {'a', 'b', 'c', 'd', 'e', 'f'}; + * thrust::stable_sort_by_key(thrust::host, keys, keys + N, values, thrust::greater()); + * // keys is now { 8, 7, 5, 4, 2, 1} + * // values is now {'d', 'f', 'e', 'b', 'c', 'a'} + * \endcode + * + * + * \see http://www.sgi.com/tech/stl/sort.html + * \see \p sort_by_key + * \see \p stable_sort + */ +template + void stable_sort_by_key(const thrust::detail::execution_policy_base &exec, + RandomAccessIterator1 keys_first, + RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first, + StrictWeakOrdering comp); + + +/*! \p stable_sort_by_key performs a key-value sort. That is, \p stable_sort_by_key + * sorts the elements in [keys_first, keys_last) and [values_first, + * values_first + (keys_last - keys_first)) into ascending key order, + * meaning that if \c i and \c j are any two valid iterators in [keys_first, + * keys_last) such that \c i precedes \c j, and \c p and \c q are iterators + * in [values_first, values_first + (keys_last - keys_first)) + * corresponding to \c i and \c j respectively, then \c *j is not less than + * \c *i. + * + * As the name suggests, \p stable_sort_by_key is stable: it preserves the + * relative ordering of equivalent elements. That is, if \c x and \c y + * are elements in [keys_first, keys_last) such that \c x precedes \c y, + * and if the two elements are equivalent (neither x < y nor + * y < x) then a postcondition of \p stable_sort_by_key is that \c x + * still precedes \c y. + * + * This version of \p stable_sort_by_key compares key objects using the function + * object \p comp. + * + * \param keys_first The beginning of the key sequence. + * \param keys_last The end of the key sequence. + * \param values_first The beginning of the value sequence. + * \param comp Comparison operator. + * + * \tparam RandomAccessIterator1 is a model of Random Access Iterator, + * \p RandomAccessIterator1 is mutable, + * and \p RandomAccessIterator1's \c value_type is convertible to \p StrictWeakOrdering's + * \c first_argument_type and \c second_argument_type. + * \tparam RandomAccessIterator2 is a model of Random Access Iterator, + * and \p RandomAccessIterator2 is mutable. + * \tparam StrictWeakOrdering is a model of Strict Weak Ordering. + * + * \pre The range [keys_first, keys_last)) shall not overlap the range [values_first, values_first + (keys_last - keys_first)). + * + * The following code snippet demonstrates how to use \p sort_by_key to sort + * an array of character values using integers as sorting keys. The keys + * are sorted in descending order using the greater comparison operator. + * + * \code + * #include + * ... + * const int N = 6; + * int keys[N] = { 1, 4, 2, 8, 5, 7}; + * char values[N] = {'a', 'b', 'c', 'd', 'e', 'f'}; + * thrust::stable_sort_by_key(keys, keys + N, values, thrust::greater()); + * // keys is now { 8, 7, 5, 4, 2, 1} + * // values is now {'d', 'f', 'e', 'b', 'c', 'a'} + * \endcode + * + * + * \see http://www.sgi.com/tech/stl/sort.html + * \see \p sort_by_key + * \see \p stable_sort + */ +template + void stable_sort_by_key(RandomAccessIterator1 keys_first, + RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first, + StrictWeakOrdering comp); + + +/*! \} // end sorting + */ + + +/*! \addtogroup reductions + * \{ + * \addtogroup predicates + * \{ + */ + + +/*! \p is_sorted returns \c true if the range [first, last) is + * sorted in ascending order, and \c false otherwise. 
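Before the formal description of each overload, a minimal hedged sketch of the predicate on a plain host array (assuming <thrust/sort.h> declares both \p is_sorted and \p sort, as documented in this file):

    #include <thrust/sort.h>
    #include <cassert>

    int main()
    {
      int A[6] = {1, 4, 2, 8, 5, 7};

      assert(!thrust::is_sorted(A, A + 6));  // 4 precedes 2, so not sorted yet

      thrust::sort(A, A + 6);                // A is now {1, 2, 4, 5, 7, 8}

      assert(thrust::is_sorted(A, A + 6));   // ascending order established
      return 0;
    }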
+ * + * Specifically, this version of \p is_sorted returns \c false if for + * some iterator \c i in the range [first, last - 1) the + * expression *(i + 1) < *i is \c true. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the sequence. + * \param last The end of the sequence. + * \return \c true, if the sequence is sorted; \c false, otherwise. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam ForwardIterator is a model of Forward Iterator, + * \p ForwardIterator's \c value_type is a model of LessThan Comparable, + * and the ordering on objects of \p ForwardIterator's \c value_type is a strict weak ordering, as defined + * in the LessThan Comparable requirements. + * + * + * The following code demonstrates how to use \p is_sorted to test whether the + * contents of a \c device_vector are stored in ascending order using the \p thrust::device execution policy + * for parallelization: + * + * \code + * #include + * #include + * #include + * #include + * ... + * thrust::device_vector v(6); + * v[0] = 1; + * v[1] = 4; + * v[2] = 2; + * v[3] = 8; + * v[4] = 5; + * v[5] = 7; + * + * bool result = thrust::is_sorted(thrust::device, v.begin(), v.end()); + * + * // result == false + * + * thrust::sort(v.begin(), v.end()); + * result = thrust::is_sorted(thrust::device, v.begin(), v.end()); + * + * // result == true + * \endcode + * + * \see http://www.sgi.com/tech/stl/is_sorted.html + * \see is_sorted_until + * \see \c sort + * \see \c stable_sort + * \see \c less + */ +template + bool is_sorted(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last); + + +/*! \p is_sorted returns \c true if the range [first, last) is + * sorted in ascending order, and \c false otherwise. + * + * Specifically, this version of \p is_sorted returns \c false if for + * some iterator \c i in the range [first, last - 1) the + * expression *(i + 1) < *i is \c true. + * + * \param first The beginning of the sequence. + * \param last The end of the sequence. + * \return \c true, if the sequence is sorted; \c false, otherwise. + * + * \tparam ForwardIterator is a model of Forward Iterator, + * \p ForwardIterator's \c value_type is a model of LessThan Comparable, + * and the ordering on objects of \p ForwardIterator's \c value_type is a strict weak ordering, as defined + * in the LessThan Comparable requirements. + * + * + * The following code demonstrates how to use \p is_sorted to test whether the + * contents of a \c device_vector are stored in ascending order. + * + * \code + * #include + * #include + * #include + * ... + * thrust::device_vector v(6); + * v[0] = 1; + * v[1] = 4; + * v[2] = 2; + * v[3] = 8; + * v[4] = 5; + * v[5] = 7; + * + * bool result = thrust::is_sorted(v.begin(), v.end()); + * + * // result == false + * + * thrust::sort(v.begin(), v.end()); + * result = thrust::is_sorted(v.begin(), v.end()); + * + * // result == true + * \endcode + * + * \see http://www.sgi.com/tech/stl/is_sorted.html + * \see is_sorted_until + * \see \c sort + * \see \c stable_sort + * \see \c less + */ +template + bool is_sorted(ForwardIterator first, + ForwardIterator last); + + +/*! \p is_sorted returns \c true if the range [first, last) is sorted in ascending + * order accoring to a user-defined comparison operation, and \c false otherwise. 
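For the comparator overloads introduced here, a hedged sketch that checks descending order with \c thrust::greater<int> (assumed headers: <thrust/sort.h> and <thrust/functional.h>):

    #include <thrust/sort.h>
    #include <thrust/functional.h>
    #include <cassert>

    int main()
    {
      int A[6] = {8, 7, 5, 4, 2, 1};
      thrust::greater<int> comp;

      // sorted with respect to comp: no adjacent pair satisfies comp(*(i + 1), *i),
      // i.e. the sequence is non-increasing
      assert(thrust::is_sorted(A, A + 6, comp));

      A[2] = 9;                               // break the descending order
      assert(!thrust::is_sorted(A, A + 6, comp));
      return 0;
    }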
+ * + * Specifically, this version of \p is_sorted returns \c false if for some iterator \c i in + * the range [first, last - 1) the expression comp(*(i + 1), *i) is \c true. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the sequence. + * \param last The end of the sequence. + * \param comp Comparison operator. + * \return \c true, if the sequence is sorted according to comp; \c false, otherwise. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam ForwardIterator is a model of Forward Iterator, + * and \p ForwardIterator's \c value_type is convertible to both \c StrictWeakOrdering's \c first_argument_type + * and \c second_argument_type. + * \tparam Compare is a model of Strict Weak Ordering. + * + * The following code snippet demonstrates how to use \p is_sorted to test whether the + * contents of a \c device_vector are stored in descending order using the \p thrust::device execution + * policy for parallelization: + * + * \code + * #include + * #include + * #include + * #include + * ... + * thrust::device_vector v(6); + * v[0] = 1; + * v[1] = 4; + * v[2] = 2; + * v[3] = 8; + * v[4] = 5; + * v[5] = 7; + * + * thrust::greater comp; + * bool result = thrust::is_sorted(thrust::device, v.begin(), v.end(), comp); + * + * // result == false + * + * thrust::sort(v.begin(), v.end(), comp); + * result = thrust::is_sorted(thrust::device, v.begin(), v.end(), comp); + * + * // result == true + * \endcode + * + * \see http://www.sgi.com/tech/stl/is_sorted.html + * \see \c sort + * \see \c stable_sort + * \see \c less + */ +template + bool is_sorted(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + Compare comp); + + +/*! \p is_sorted returns \c true if the range [first, last) is sorted in ascending + * order accoring to a user-defined comparison operation, and \c false otherwise. + * + * Specifically, this version of \p is_sorted returns \c false if for some iterator \c i in + * the range [first, last - 1) the expression comp(*(i + 1), *i) is \c true. + * + * \param first The beginning of the sequence. + * \param last The end of the sequence. + * \param comp Comparison operator. + * \return \c true, if the sequence is sorted according to comp; \c false, otherwise. + * + * \tparam ForwardIterator is a model of Forward Iterator, + * and \p ForwardIterator's \c value_type is convertible to both \c StrictWeakOrdering's \c first_argument_type + * and \c second_argument_type. + * \tparam Compare is a model of Strict Weak Ordering. + * + * The following code snippet demonstrates how to use \p is_sorted to test whether the + * contents of a \c device_vector are stored in descending order. + * + * \code + * #include + * #include + * #include + * ... + * thrust::device_vector v(6); + * v[0] = 1; + * v[1] = 4; + * v[2] = 2; + * v[3] = 8; + * v[4] = 5; + * v[5] = 7; + * + * thrust::greater comp; + * bool result = thrust::is_sorted(v.begin(), v.end(), comp); + * + * // result == false + * + * thrust::sort(v.begin(), v.end(), comp); + * result = thrust::is_sorted(v.begin(), v.end(), comp); + * + * // result == true + * \endcode + * + * \see http://www.sgi.com/tech/stl/is_sorted.html + * \see \c sort + * \see \c stable_sort + * \see \c less + */ +template + bool is_sorted(ForwardIterator first, + ForwardIterator last, + Compare comp); + + +/*! 
This version of \p is_sorted_until returns the last iterator \c i in [first,last] for + * which the range [first,last) is sorted using \c operator<. If distance(first,last) < 2, + * \p is_sorted_until simply returns \p last. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the range of interest. + * \param last The end of the range of interest. + * \return The last iterator in the input range for which it is sorted. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam ForwardIterator is a model of Forward Iterator and + * \p ForwardIterator's \c value_type is a model of LessThan Comparable. + * + * The following code snippet demonstrates how to use \p is_sorted_until to find the first position + * in an array where the data becomes unsorted using the \p thrust::host execution policy for + * parallelization: + * + * \code + * #include + * #include + * + * ... + * + * int A[8] = {0, 1, 2, 3, 0, 1, 2, 3}; + * + * int * B = thrust::is_sorted_until(thrust::host, A, A + 8); + * + * // B - A is 4 + * // [A, B) is sorted + * \endcode + * + * \see \p is_sorted + * \see \p sort + * \see \p sort_by_key + * \see \p stable_sort + * \see \p stable_sort_by_key + */ +template + ForwardIterator is_sorted_until(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last); + + +/*! This version of \p is_sorted_until returns the last iterator \c i in [first,last] for + * which the range [first,last) is sorted using \c operator<. If distance(first,last) < 2, + * \p is_sorted_until simply returns \p last. + * + * \param first The beginning of the range of interest. + * \param last The end of the range of interest. + * \return The last iterator in the input range for which it is sorted. + * + * \tparam ForwardIterator is a model of Forward Iterator and + * \p ForwardIterator's \c value_type is a model of LessThan Comparable. + * + * The following code snippet demonstrates how to use \p is_sorted_until to find the first position + * in an array where the data becomes unsorted: + * + * \code + * #include + * + * ... + * + * int A[8] = {0, 1, 2, 3, 0, 1, 2, 3}; + * + * int * B = thrust::is_sorted_until(A, A + 8); + * + * // B - A is 4 + * // [A, B) is sorted + * \endcode + * + * \see \p is_sorted + * \see \p sort + * \see \p sort_by_key + * \see \p stable_sort + * \see \p stable_sort_by_key + */ +template + ForwardIterator is_sorted_until(ForwardIterator first, + ForwardIterator last); + + +/*! This version of \p is_sorted_until returns the last iterator \c i in [first,last] for + * which the range [first,last) is sorted using the function object \c comp. If distance(first,last) < 2, + * \p is_sorted_until simply returns \p last. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization: + * \param first The beginning of the range of interest. + * \param last The end of the range of interest. + * \param comp The function object to use for comparison. + * \return The last iterator in the input range for which it is sorted. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam ForwardIterator is a model of Forward Iterator and + * \p ForwardIterator's \c value_type is convertible to \p Compare's \c argument_type. + * \tparam Compare is a model of Strict Weak Ordering. 
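A hedged sketch of how the returned iterator is typically turned into a sorted-prefix length, using a \c thrust::host_vector purely for illustration (assumed headers: <thrust/sort.h> and <thrust/host_vector.h>):

    #include <thrust/sort.h>
    #include <thrust/host_vector.h>
    #include <iostream>

    int main()
    {
      int init[8] = {0, 1, 2, 3, 0, 1, 2, 3};
      thrust::host_vector<int> v(init, init + 8);

      // iterator one past the longest sorted prefix
      thrust::host_vector<int>::iterator it = thrust::is_sorted_until(v.begin(), v.end());

      std::cout << "sorted prefix length: " << (it - v.begin()) << std::endl;  // prints 4
      return 0;
    }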
+ * + * The following code snippet demonstrates how to use \p is_sorted_until to find the first position + * in an array where the data becomes unsorted in descending order using the \p thrust::host execution + * policy for parallelization: + * + * \code + * #include + * #include + * #include + * + * ... + * + * int A[8] = {3, 2, 1, 0, 3, 2, 1, 0}; + * + * thrust::greater comp; + * int * B = thrust::is_sorted_until(thrust::host, A, A + 8, comp); + * + * // B - A is 4 + * // [A, B) is sorted in descending order + * \endcode + * + * \see \p is_sorted + * \see \p sort + * \see \p sort_by_key + * \see \p stable_sort + * \see \p stable_sort_by_key + */ +template + ForwardIterator is_sorted_until(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + Compare comp); + + +/*! This version of \p is_sorted_until returns the last iterator \c i in [first,last] for + * which the range [first,last) is sorted using the function object \c comp. If distance(first,last) < 2, + * \p is_sorted_until simply returns \p last. + * + * \param first The beginning of the range of interest. + * \param last The end of the range of interest. + * \param comp The function object to use for comparison. + * \return The last iterator in the input range for which it is sorted. + * + * \tparam ForwardIterator is a model of Forward Iterator and + * \p ForwardIterator's \c value_type is convertible to \p Compare's \c argument_type. + * \tparam Compare is a model of Strict Weak Ordering. + * + * The following code snippet demonstrates how to use \p is_sorted_until to find the first position + * in an array where the data becomes unsorted in descending order: + * + * \code + * #include + * #include + * + * ... + * + * int A[8] = {3, 2, 1, 0, 3, 2, 1, 0}; + * + * thrust::greater comp; + * int * B = thrust::is_sorted_until(A, A + 8, comp); + * + * // B - A is 4 + * // [A, B) is sorted in descending order + * \endcode + * + * \see \p is_sorted + * \see \p sort + * \see \p sort_by_key + * \see \p stable_sort + * \see \p stable_sort_by_key + */ +template + ForwardIterator is_sorted_until(ForwardIterator first, + ForwardIterator last, + Compare comp); + + +/*! \} // end predicates + * \} // end reductions + */ + + +} // end namespace thrust + +#include + diff --git a/compat/thrust/swap.h b/compat/thrust/swap.h new file mode 100644 index 0000000..085e546 --- /dev/null +++ b/compat/thrust/swap.h @@ -0,0 +1,190 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file swap.h + * \brief Functions for swapping the value of elements + */ + +#pragma once + +#include +#include + +// empty Doxygen comment below so namespace thrust's documentation will be extracted + +/*! + */ +namespace thrust +{ + +/*! \addtogroup utility + * \{ + */ + +/*! \addtogroup swap + * \{ + */ + +/*! \p swap assigns the contents of \c a to \c b and the + * contents of \c b to \c a. This is used as a primitive operation + * by many other algorithms. 
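A tiny self-contained sketch of the call (assuming <thrust/swap.h> as the header):

    #include <thrust/swap.h>

    int main()
    {
      int x = 1;
      int y = 2;

      thrust::swap(x, y);  // exchange the two values

      // x == 2, y == 1
      return 0;
    }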
+ * + * \param a The first value of interest. After completion, + * the value of b will be returned here. + * \param b The second value of interest. After completion, + * the value of a will be returned here. + * + * \tparam Assignable is a model of Assignable. + * + * The following code snippet demonstrates how to use \p swap to + * swap the contents of two variables. + * + * \code + * #include + * ... + * int x = 1; + * int y = 2; + * thrust::swap(x,h); + * + * // x == 2, y == 1 + * \endcode + */ +template +__host__ __device__ +inline void swap(Assignable1 &a, Assignable2 &b); + +/*! \} // swap + */ + +/*! \} // utility + */ + + +/*! \addtogroup copying + * \{ + */ + + +/*! \p swap_ranges swaps each of the elements in the range [first1, last1) + * with the corresponding element in the range [first2, first2 + (last1 - first1)). + * That is, for each integer \c n such that 0 <= n < (last1 - first1), it swaps + * *(first1 + n) and *(first2 + n). The return value is + * first2 + (last1 - first1). + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first1 The beginning of the first sequence to swap. + * \param last1 One position past the last element of the first sequence to swap. + * \param first2 The beginning of the second sequence to swap. + * \return An iterator pointing to one position past the last element of the second + * sequence to swap. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam ForwardIterator1 is a model of Forward Iterator, + * and \p ForwardIterator1's \c value_type must be convertible to \p ForwardIterator2's \c value_type. + * \tparam ForwardIterator2 is a model of Forward Iterator, + * and \p ForwardIterator2's \c value_type must be convertible to \p ForwardIterator1's \c value_type. + * + * \pre \p first1 may equal \p first2, but the range [first1, last1) shall not overlap the range [first2, first2 + (last1 - first1)) otherwise. + * + * The following code snippet demonstrates how to use \p swap_ranges to + * swap the contents of two \c thrust::device_vectors using the \p thrust::device execution + * policy for parallelization: + * + * \code + * #include + * #include + * #include + * ... + * thrust::device_vector v1(2), v2(2); + * v1[0] = 1; + * v1[1] = 2; + * v2[0] = 3; + * v2[1] = 4; + * + * thrust::swap_ranges(thrust::device, v1.begin(), v1.end(), v2.begin()); + * + * // v1[0] == 3, v1[1] == 4, v2[0] == 1, v2[1] == 2 + * \endcode + * + * \see http://www.sgi.com/tech/stl/swap_ranges.html + * \see \c swap + */ +template + ForwardIterator2 swap_ranges(const thrust::detail::execution_policy_base &exec, + ForwardIterator1 first1, + ForwardIterator1 last1, + ForwardIterator2 first2); + + +/*! \p swap_ranges swaps each of the elements in the range [first1, last1) + * with the corresponding element in the range [first2, first2 + (last1 - first1)). + * That is, for each integer \c n such that 0 <= n < (last1 - first1), it swaps + * *(first1 + n) and *(first2 + n). The return value is + * first2 + (last1 - first1). + * + * \param first1 The beginning of the first sequence to swap. + * \param last1 One position past the last element of the first sequence to swap. + * \param first2 The beginning of the second sequence to swap. + * \return An iterator pointing to one position past the last element of the second + * sequence to swap. 
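A corresponding hedged sketch for exchanging two whole ranges, shown on plain host arrays to keep it self-contained (assuming <thrust/swap.h> as the header):

    #include <thrust/swap.h>

    int main()
    {
      int a[3] = {1, 2, 3};
      int b[3] = {4, 5, 6};

      // element-wise exchange of [a, a + 3) with [b, b + 3); returns b + 3
      int *ret = thrust::swap_ranges(a, a + 3, b);
      (void) ret;  // silence unused-variable warnings

      // a == {4, 5, 6}, b == {1, 2, 3}
      return 0;
    }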
+ * + * \tparam ForwardIterator1 is a model of Forward Iterator, + * and \p ForwardIterator1's \c value_type must be convertible to \p ForwardIterator2's \c value_type. + * \tparam ForwardIterator2 is a model of Forward Iterator, + * and \p ForwardIterator2's \c value_type must be convertible to \p ForwardIterator1's \c value_type. + * + * \pre \p first1 may equal \p first2, but the range [first1, last1) shall not overlap the range [first2, first2 + (last1 - first1)) otherwise. + * + * The following code snippet demonstrates how to use \p swap_ranges to + * swap the contents of two \c thrust::device_vectors. + * + * \code + * #include + * #include + * ... + * thrust::device_vector v1(2), v2(2); + * v1[0] = 1; + * v1[1] = 2; + * v2[0] = 3; + * v2[1] = 4; + * + * thrust::swap_ranges(v1.begin(), v1.end(), v2.begin()); + * + * // v1[0] == 3, v1[1] == 4, v2[0] == 1, v2[1] == 2 + * \endcode + * + * \see http://www.sgi.com/tech/stl/swap_ranges.html + * \see \c swap + */ +template + ForwardIterator2 swap_ranges(ForwardIterator1 first1, + ForwardIterator1 last1, + ForwardIterator2 first2); + + +/*! \} // copying + */ + + +} // end thrust + +#include + diff --git a/compat/thrust/system/cpp/detail/adjacent_difference.h b/compat/thrust/system/cpp/detail/adjacent_difference.h new file mode 100644 index 0000000..ea212ff --- /dev/null +++ b/compat/thrust/system/cpp/detail/adjacent_difference.h @@ -0,0 +1,54 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file adjacent_difference.h + * \brief C++ implementation of adjacent_difference. + */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cpp +{ +namespace detail +{ + +template +OutputIterator adjacent_difference(execution_policy &, + InputIterator first, + InputIterator last, + OutputIterator result, + BinaryFunction binary_op) +{ + return thrust::system::detail::internal::scalar::adjacent_difference(first, last, result, binary_op); +} + +} // end namespace detail +} // end namespace cpp +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cpp/detail/assign_value.h b/compat/thrust/system/cpp/detail/assign_value.h new file mode 100644 index 0000000..847fc97 --- /dev/null +++ b/compat/thrust/system/cpp/detail/assign_value.h @@ -0,0 +1,42 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cpp +{ +namespace detail +{ + +template +__host__ __device__ + void assign_value(thrust::system::cpp::detail::execution_policy &, Pointer1 dst, Pointer2 src) +{ + *thrust::raw_pointer_cast(dst) = *thrust::raw_pointer_cast(src); +} // end assign_value() + +} // end detail +} // end cpp +} // end system +} // end thrust + diff --git a/compat/thrust/system/cpp/detail/binary_search.h b/compat/thrust/system/cpp/detail/binary_search.h new file mode 100644 index 0000000..37af539 --- /dev/null +++ b/compat/thrust/system/cpp/detail/binary_search.h @@ -0,0 +1,77 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file binary_search.h + * \brief C++ implementation of binary search algorithms. + */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cpp +{ +namespace detail +{ + +template +ForwardIterator lower_bound(tag, + ForwardIterator first, + ForwardIterator last, + const T& val, + StrictWeakOrdering comp) +{ + return thrust::system::detail::internal::scalar::lower_bound(first, last, val, comp); +} + + +template +ForwardIterator upper_bound(tag, + ForwardIterator first, + ForwardIterator last, + const T& val, + StrictWeakOrdering comp) +{ + return thrust::system::detail::internal::scalar::upper_bound(first, last, val, comp); +} + +template +bool binary_search(tag, + ForwardIterator first, + ForwardIterator last, + const T& val, + StrictWeakOrdering comp) +{ + return thrust::system::detail::internal::scalar::binary_search(first, last, val, comp); +} + +} // end namespace detail +} // end namespace cpp +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cpp/detail/copy.h b/compat/thrust/system/cpp/detail/copy.h new file mode 100644 index 0000000..7299bbb --- /dev/null +++ b/compat/thrust/system/cpp/detail/copy.h @@ -0,0 +1,61 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file copy.h + * \brief C++ implementations of copy functions. 
+ */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cpp +{ +namespace detail +{ + +template + OutputIterator copy(tag, + InputIterator first, + InputIterator last, + OutputIterator result) +{ + return thrust::system::detail::internal::scalar::copy(first, last, result); +} + +template + OutputIterator copy_n(tag, + InputIterator first, + Size n, + OutputIterator result) +{ + return thrust::system::detail::internal::scalar::copy_n(first, n, result); +} + +} // end namespace detail +} // end namespace cpp +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cpp/detail/copy_if.h b/compat/thrust/system/cpp/detail/copy_if.h new file mode 100644 index 0000000..2faadfa --- /dev/null +++ b/compat/thrust/system/cpp/detail/copy_if.h @@ -0,0 +1,51 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cpp +{ +namespace detail +{ + + +template + OutputIterator copy_if(tag, + InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator result, + Predicate pred) +{ + return thrust::system::detail::internal::scalar::copy_if(first, last, stencil, result, pred); +} + +} // end detail +} // end cpp +} // end system +} // end thrust + diff --git a/compat/thrust/system/cpp/detail/count.h b/compat/thrust/system/cpp/detail/count.h new file mode 100644 index 0000000..a307fc5 --- /dev/null +++ b/compat/thrust/system/cpp/detail/count.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system has no special version of this algorithm + diff --git a/compat/thrust/system/cpp/detail/dispatch/sort.h b/compat/thrust/system/cpp/detail/dispatch/sort.h new file mode 100644 index 0000000..2a03cf6 --- /dev/null +++ b/compat/thrust/system/cpp/detail/dispatch/sort.h @@ -0,0 +1,119 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cpp +{ +namespace detail +{ +namespace dispatch +{ + +//////////////// +// Radix Sort // +//////////////// + +template +void stable_sort(RandomAccessIterator first, + RandomAccessIterator last, + StrictWeakOrdering comp, + thrust::detail::true_type) +{ + thrust::system::detail::internal::scalar::stable_radix_sort(first, last); + + // if comp is greater then reverse the keys + typedef typename thrust::iterator_traits::value_type KeyType; + const static bool reverse = thrust::detail::is_same >::value; + + if (reverse) + thrust::reverse(first, last); +} + +template +void stable_sort_by_key(RandomAccessIterator1 first1, + RandomAccessIterator1 last1, + RandomAccessIterator2 first2, + StrictWeakOrdering comp, + thrust::detail::true_type) +{ + // if comp is greater then reverse the keys and values + typedef typename thrust::iterator_traits::value_type KeyType; + const static bool reverse = thrust::detail::is_same >::value; + + // note, we also have to reverse the (unordered) input to preserve stability + if (reverse) + { + thrust::reverse(first1, last1); + thrust::reverse(first2, first2 + (last1 - first1)); + } + + thrust::system::detail::internal::scalar::stable_radix_sort_by_key(first1, last1, first2); + + if (reverse) + { + thrust::reverse(first1, last1); + thrust::reverse(first2, first2 + (last1 - first1)); + } +} + +//////////////// +// Merge Sort // +//////////////// + +template +void stable_sort(RandomAccessIterator first, + RandomAccessIterator last, + StrictWeakOrdering comp, + thrust::detail::false_type) +{ + thrust::system::detail::internal::scalar::stable_merge_sort(first, last, comp); +} + +template +void stable_sort_by_key(RandomAccessIterator1 first1, + RandomAccessIterator1 last1, + RandomAccessIterator2 first2, + StrictWeakOrdering comp, + thrust::detail::false_type) +{ + thrust::system::detail::internal::scalar::stable_merge_sort_by_key(first1, last1, first2, comp); +} + +} // end namespace dispatch +} // end namespace detail +} // end namespace cpp +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cpp/detail/equal.h b/compat/thrust/system/cpp/detail/equal.h new file mode 100644 index 0000000..a307fc5 --- /dev/null +++ b/compat/thrust/system/cpp/detail/equal.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +// this system has no special version of this algorithm + diff --git a/compat/thrust/system/cpp/detail/execution_policy.h b/compat/thrust/system/cpp/detail/execution_policy.h new file mode 100644 index 0000000..229ff5c --- /dev/null +++ b/compat/thrust/system/cpp/detail/execution_policy.h @@ -0,0 +1,84 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +// put the canonical tag in the same ns as the backend's entry points +namespace cpp +{ +namespace detail +{ + +// this awkward sequence of definitions arise +// from the desire both for tag to derive +// from execution_policy and for execution_policy +// to convert to tag (when execution_policy is not +// an ancestor of tag) + +// forward declaration of tag +struct tag; + +// forward declaration of execution_policy +template struct execution_policy; + +// specialize execution_policy for tag +template<> + struct execution_policy + : thrust::execution_policy +{}; + +// tag's definition comes before the +// generic definition of execution_policy +struct tag : execution_policy {}; + +// allow conversion to tag when it is not a successor +template + struct execution_policy + : thrust::execution_policy +{ + // allow conversion to tag + inline operator tag () const + { + return tag(); + } +}; + +} // end detail + +// alias execution_policy and tag here +using thrust::system::cpp::detail::execution_policy; +using thrust::system::cpp::detail::tag; + +} // end cpp +} // end system + +// alias items at top-level +namespace cpp +{ + +using thrust::system::cpp::execution_policy; +using thrust::system::cpp::tag; + +} // end cpp +} // end thrust + diff --git a/compat/thrust/system/cpp/detail/extrema.h b/compat/thrust/system/cpp/detail/extrema.h new file mode 100644 index 0000000..3eab6d4 --- /dev/null +++ b/compat/thrust/system/cpp/detail/extrema.h @@ -0,0 +1,72 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file extrema.h + * \brief C++ implementations of extrema functions. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cpp +{ +namespace detail +{ + +template +ForwardIterator min_element(execution_policy &, + ForwardIterator first, + ForwardIterator last, + BinaryPredicate comp) +{ + return thrust::system::detail::internal::scalar::min_element(first, last, comp); +} + + +template +ForwardIterator max_element(execution_policy &, + ForwardIterator first, + ForwardIterator last, + BinaryPredicate comp) +{ + return thrust::system::detail::internal::scalar::max_element(first, last, comp); +} + + +template +thrust::pair minmax_element(execution_policy &, + ForwardIterator first, + ForwardIterator last, + BinaryPredicate comp) +{ + return thrust::system::detail::internal::scalar::minmax_element(first, last, comp); +} + +} // end namespace detail +} // end namespace cpp +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cpp/detail/fill.h b/compat/thrust/system/cpp/detail/fill.h new file mode 100644 index 0000000..a307fc5 --- /dev/null +++ b/compat/thrust/system/cpp/detail/fill.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system has no special version of this algorithm + diff --git a/compat/thrust/system/cpp/detail/find.h b/compat/thrust/system/cpp/detail/find.h new file mode 100644 index 0000000..9698524 --- /dev/null +++ b/compat/thrust/system/cpp/detail/find.h @@ -0,0 +1,52 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file find.h + * \brief C++ implementation of find_if. 
+ */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cpp +{ +namespace detail +{ + +template +InputIterator find_if(tag, + InputIterator first, + InputIterator last, + Predicate pred) +{ + return thrust::system::detail::internal::scalar::find_if(first, last, pred); +} + +} // end namespace detail +} // end namespace cpp +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cpp/detail/for_each.h b/compat/thrust/system/cpp/detail/for_each.h new file mode 100644 index 0000000..8d4e1c7 --- /dev/null +++ b/compat/thrust/system/cpp/detail/for_each.h @@ -0,0 +1,60 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cpp +{ +namespace detail +{ + + +template +InputIterator for_each(thrust::system::cpp::detail::execution_policy &, + InputIterator first, + InputIterator last, + UnaryFunction f) +{ + return thrust::system::detail::internal::scalar::for_each(first, last, f); +} + +template +InputIterator for_each_n(thrust::system::cpp::detail::execution_policy &, + InputIterator first, + Size n, + UnaryFunction f) +{ + return thrust::system::detail::internal::scalar::for_each_n(first, n, f); +} + +} // end namespace detail +} // end namespace cpp +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cpp/detail/gather.h b/compat/thrust/system/cpp/detail/gather.h new file mode 100644 index 0000000..a307fc5 --- /dev/null +++ b/compat/thrust/system/cpp/detail/gather.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system has no special version of this algorithm + diff --git a/compat/thrust/system/cpp/detail/generate.h b/compat/thrust/system/cpp/detail/generate.h new file mode 100644 index 0000000..a307fc5 --- /dev/null +++ b/compat/thrust/system/cpp/detail/generate.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system has no special version of this algorithm + diff --git a/compat/thrust/system/cpp/detail/get_value.h b/compat/thrust/system/cpp/detail/get_value.h new file mode 100644 index 0000000..5ddb2c8 --- /dev/null +++ b/compat/thrust/system/cpp/detail/get_value.h @@ -0,0 +1,45 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cpp +{ +namespace detail +{ + + +template +__host__ __device__ + typename thrust::iterator_value::type + get_value(thrust::system::cpp::detail::execution_policy &, Pointer ptr) +{ + return *thrust::raw_pointer_cast(ptr); +} // end get_value() + + +} // end detail +} // end cpp +} // end system +} // end thrust + diff --git a/compat/thrust/system/cpp/detail/inner_product.h b/compat/thrust/system/cpp/detail/inner_product.h new file mode 100644 index 0000000..a307fc5 --- /dev/null +++ b/compat/thrust/system/cpp/detail/inner_product.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system has no special version of this algorithm + diff --git a/compat/thrust/system/cpp/detail/iter_swap.h b/compat/thrust/system/cpp/detail/iter_swap.h new file mode 100644 index 0000000..257276f --- /dev/null +++ b/compat/thrust/system/cpp/detail/iter_swap.h @@ -0,0 +1,46 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cpp +{ +namespace detail +{ + + +template +__host__ __device__ + void iter_swap(tag, Pointer1 a, Pointer2 b) +{ + using thrust::swap; + swap(*thrust::raw_pointer_cast(a), *thrust::raw_pointer_cast(b)); +} // end iter_swap() + + +} // end detail +} // end cpp +} // end system +} // end thrust + diff --git a/compat/thrust/system/cpp/detail/logical.h b/compat/thrust/system/cpp/detail/logical.h new file mode 100644 index 0000000..a307fc5 --- /dev/null +++ b/compat/thrust/system/cpp/detail/logical.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system has no special version of this algorithm + diff --git a/compat/thrust/system/cpp/detail/malloc_and_free.h b/compat/thrust/system/cpp/detail/malloc_and_free.h new file mode 100644 index 0000000..4f8ae82 --- /dev/null +++ b/compat/thrust/system/cpp/detail/malloc_and_free.h @@ -0,0 +1,54 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include // for malloc & free +#include + +namespace thrust +{ +namespace system +{ +namespace cpp +{ +namespace detail +{ + + +// note that malloc returns a raw pointer to avoid +// depending on the heavyweight thrust/system/cpp/memory.h header +template + void *malloc(execution_policy &, std::size_t n) +{ + return std::malloc(n); +} // end malloc() + + +template + void free(execution_policy &, Pointer ptr) +{ + std::free(thrust::raw_pointer_cast(ptr)); +} // end free() + + +} // end detail +} // end cpp +} // end system +} // end thrust + diff --git a/compat/thrust/system/cpp/detail/memory.inl b/compat/thrust/system/cpp/detail/memory.inl new file mode 100644 index 0000000..7f9a48d --- /dev/null +++ b/compat/thrust/system/cpp/detail/memory.inl @@ -0,0 +1,92 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +namespace thrust +{ + +// XXX WAR an issue with MSVC 2005 (cl v14.00) incorrectly implementing +// pointer_raw_pointer for pointer by specializing it here +#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC) && (_MSC_VER <= 1400) +namespace detail +{ + +template + struct pointer_raw_pointer< thrust::cpp::pointer > +{ + typedef typename thrust::cpp::pointer::raw_pointer type; +}; // end pointer_raw_pointer + +} // end detail +#endif + +namespace system +{ +namespace cpp +{ + + +template + template + reference & + reference + ::operator=(const reference &other) +{ + return super_t::operator=(other); +} // end reference::operator=() + +template + reference & + reference + ::operator=(const value_type &x) +{ + return super_t::operator=(x); +} // end reference::operator=() + +template +__host__ __device__ +void swap(reference a, reference b) +{ + a.swap(b); +} // end swap() + +pointer malloc(std::size_t n) +{ + tag t; + return pointer(thrust::system::cpp::detail::malloc(t, n)); +} // end malloc() + +template +pointer malloc(std::size_t n) +{ + pointer raw_ptr = thrust::system::cpp::malloc(sizeof(T) * n); + return pointer(reinterpret_cast(raw_ptr.get())); +} // end malloc() + +void free(pointer ptr) +{ + tag t; + return thrust::system::cpp::detail::free(t, ptr); +} // end free() + +} // end cpp +} // end system +} // end thrust + diff --git a/compat/thrust/system/cpp/detail/merge.h b/compat/thrust/system/cpp/detail/merge.h new file mode 100644 index 0000000..7f01c07 --- /dev/null +++ b/compat/thrust/system/cpp/detail/merge.h @@ -0,0 +1,76 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cpp +{ +namespace detail +{ + +template +OutputIterator merge(execution_policy &, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + StrictWeakOrdering comp) +{ + return thrust::system::detail::internal::scalar::merge(first1, last1, first2, last2, result, comp); +} + +template +thrust::pair + merge_by_key(execution_policy &, + InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 keys_last2, + InputIterator3 values_first1, + InputIterator4 values_first2, + OutputIterator1 keys_result, + OutputIterator2 values_result, + StrictWeakOrdering comp) +{ + return thrust::system::detail::internal::scalar::merge_by_key(keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result, comp); +} + +} // end namespace detail +} // end namespace cpp +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cpp/detail/mismatch.h b/compat/thrust/system/cpp/detail/mismatch.h new file mode 100644 index 0000000..a307fc5 --- /dev/null +++ b/compat/thrust/system/cpp/detail/mismatch.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system has no special version of this algorithm + diff --git a/compat/thrust/system/cpp/detail/par.h b/compat/thrust/system/cpp/detail/par.h new file mode 100644 index 0000000..953e527 --- /dev/null +++ b/compat/thrust/system/cpp/detail/par.h @@ -0,0 +1,66 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cpp +{ +namespace detail +{ + + +struct par_t : thrust::system::cpp::detail::execution_policy +{ + par_t() : thrust::system::cpp::detail::execution_policy() {} + + template + thrust::detail::execute_with_allocator + operator()(Allocator &alloc) const + { + return thrust::detail::execute_with_allocator(alloc); + } +}; + + +} // end detail + + +static const detail::par_t par; + + +} // end cpp +} // end system + + +// alias par here +namespace cpp +{ + + +using thrust::system::cpp::par; + + +} // end cpp +} // end thrust + diff --git a/compat/thrust/system/cpp/detail/partition.h b/compat/thrust/system/cpp/detail/partition.h new file mode 100644 index 0000000..25a4f1c --- /dev/null +++ b/compat/thrust/system/cpp/detail/partition.h @@ -0,0 +1,95 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file partition.h + * \brief cpp implementations of partition functions + */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cpp +{ +namespace detail +{ + +template + ForwardIterator stable_partition(tag, + ForwardIterator first, + ForwardIterator last, + Predicate pred) +{ + return thrust::system::detail::internal::scalar::stable_partition(first, last, pred); +} + +template + ForwardIterator stable_partition(tag, + ForwardIterator first, + ForwardIterator last, + InputIterator stencil, + Predicate pred) +{ + return thrust::system::detail::internal::scalar::stable_partition(first, last, stencil, pred); +} + +template + thrust::pair + stable_partition_copy(tag, + InputIterator first, + InputIterator last, + OutputIterator1 out_true, + OutputIterator2 out_false, + Predicate pred) +{ + return thrust::system::detail::internal::scalar::stable_partition_copy(first, last, out_true, out_false, pred); +} + +template + thrust::pair + stable_partition_copy(tag, + InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator1 out_true, + OutputIterator2 out_false, + Predicate pred) +{ + return thrust::system::detail::internal::scalar::stable_partition_copy(first, last, stencil, out_true, out_false, pred); +} + +} // end namespace detail +} // end namespace cpp +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cpp/detail/reduce.h b/compat/thrust/system/cpp/detail/reduce.h new file mode 100644 index 0000000..5428206 --- /dev/null +++ b/compat/thrust/system/cpp/detail/reduce.h @@ -0,0 +1,54 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file reduce.h + * \brief C++ implementation of reduce algorithms. + */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cpp +{ +namespace detail +{ + +template + OutputType reduce(execution_policy &, + InputIterator begin, + InputIterator end, + OutputType init, + BinaryFunction binary_op) +{ + return thrust::system::detail::internal::scalar::reduce(begin, end, init, binary_op); +} + +} // end namespace detail +} // end namespace cpp +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cpp/detail/reduce_by_key.h b/compat/thrust/system/cpp/detail/reduce_by_key.h new file mode 100644 index 0000000..22dc2d9 --- /dev/null +++ b/compat/thrust/system/cpp/detail/reduce_by_key.h @@ -0,0 +1,57 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cpp +{ +namespace detail +{ + +template + thrust::pair + reduce_by_key(execution_policy &, + InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + OutputIterator1 keys_output, + OutputIterator2 values_output, + BinaryPredicate binary_pred, + BinaryFunction binary_op) +{ + return thrust::system::detail::internal::scalar::reduce_by_key(keys_first, keys_last, values_first, keys_output, values_output, binary_pred, binary_op); +} + +} // end namespace detail +} // end namespace cpp +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cpp/detail/remove.h b/compat/thrust/system/cpp/detail/remove.h new file mode 100644 index 0000000..cf2202b --- /dev/null +++ b/compat/thrust/system/cpp/detail/remove.h @@ -0,0 +1,88 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cpp +{ +namespace detail +{ + +template + ForwardIterator remove_if(tag, + ForwardIterator first, + ForwardIterator last, + Predicate pred) +{ + return thrust::system::detail::internal::scalar::remove_if(first, last, pred); +} + + +template + ForwardIterator remove_if(tag, + ForwardIterator first, + ForwardIterator last, + InputIterator stencil, + Predicate pred) +{ + return thrust::system::detail::internal::scalar::remove_if(first, last, stencil, pred); +} + + +template + OutputIterator remove_copy_if(tag, + InputIterator first, + InputIterator last, + OutputIterator result, + Predicate pred) +{ + return thrust::system::detail::internal::scalar::remove_copy_if(first, last, result, pred); +} + + + +template + OutputIterator remove_copy_if(tag, + InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator result, + Predicate pred) +{ + return thrust::system::detail::internal::scalar::remove_copy_if(first, last, stencil, result, pred); +} + +} // end namespace detail +} // end namespace cpp +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cpp/detail/replace.h b/compat/thrust/system/cpp/detail/replace.h new file mode 100644 index 0000000..a307fc5 --- /dev/null +++ b/compat/thrust/system/cpp/detail/replace.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system has no special version of this algorithm + diff --git a/compat/thrust/system/cpp/detail/reverse.h b/compat/thrust/system/cpp/detail/reverse.h new file mode 100644 index 0000000..a307fc5 --- /dev/null +++ b/compat/thrust/system/cpp/detail/reverse.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system has no special version of this algorithm + diff --git a/compat/thrust/system/cpp/detail/scan.h b/compat/thrust/system/cpp/detail/scan.h new file mode 100644 index 0000000..d4bae1e --- /dev/null +++ b/compat/thrust/system/cpp/detail/scan.h @@ -0,0 +1,70 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file scan.h + * \brief C++ implementations of scan functions. + */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cpp +{ +namespace detail +{ + +template + OutputIterator inclusive_scan(execution_policy &, + InputIterator first, + InputIterator last, + OutputIterator result, + BinaryFunction binary_op) +{ + return thrust::system::detail::internal::scalar::inclusive_scan(first, last, result, binary_op); +} + + +template + OutputIterator exclusive_scan(execution_policy &, + InputIterator first, + InputIterator last, + OutputIterator result, + T init, + BinaryFunction binary_op) +{ + return thrust::system::detail::internal::scalar::exclusive_scan(first, last, result, init, binary_op); +} + +} // end namespace detail +} // end namespace cpp +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cpp/detail/scan_by_key.h b/compat/thrust/system/cpp/detail/scan_by_key.h new file mode 100644 index 0000000..4165d84 --- /dev/null +++ b/compat/thrust/system/cpp/detail/scan_by_key.h @@ -0,0 +1,71 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cpp +{ +namespace detail +{ + +template + OutputIterator inclusive_scan_by_key(tag, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + OutputIterator result, + BinaryPredicate binary_pred, + BinaryFunction binary_op) +{ + return thrust::system::detail::internal::scalar::inclusive_scan_by_key(first1, last1, first2, result, binary_pred, binary_op); +} + + +template + OutputIterator exclusive_scan_by_key(tag, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + OutputIterator result, + T init, + BinaryPredicate binary_pred, + BinaryFunction binary_op) +{ + return thrust::system::detail::internal::scalar::exclusive_scan_by_key(first1, last1, first2, result, init, binary_pred, binary_op); +} + +} // end namespace detail +} // end namespace cpp +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cpp/detail/scatter.h b/compat/thrust/system/cpp/detail/scatter.h new file mode 100644 index 0000000..a307fc5 --- /dev/null +++ b/compat/thrust/system/cpp/detail/scatter.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system has no special version of this algorithm + diff --git a/compat/thrust/system/cpp/detail/sequence.h b/compat/thrust/system/cpp/detail/sequence.h new file mode 100644 index 0000000..a307fc5 --- /dev/null +++ b/compat/thrust/system/cpp/detail/sequence.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system has no special version of this algorithm + diff --git a/compat/thrust/system/cpp/detail/set_operations.h b/compat/thrust/system/cpp/detail/set_operations.h new file mode 100644 index 0000000..07ce712 --- /dev/null +++ b/compat/thrust/system/cpp/detail/set_operations.h @@ -0,0 +1,105 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cpp +{ +namespace detail +{ + + +template + OutputIterator set_difference(execution_policy &, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + StrictWeakOrdering comp) +{ + return thrust::system::detail::internal::scalar::set_difference(first1, last1, first2, last2, result, comp); +} + + +template + OutputIterator set_intersection(execution_policy &, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + StrictWeakOrdering comp) +{ + return thrust::system::detail::internal::scalar::set_intersection(first1, last1, first2, last2, result, comp); +} + + +template + OutputIterator set_symmetric_difference(execution_policy &, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + StrictWeakOrdering comp) +{ + return thrust::system::detail::internal::scalar::set_symmetric_difference(first1, last1, first2, last2, result, comp); +} + + +template + OutputIterator set_union(execution_policy &, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + StrictWeakOrdering comp) +{ + return thrust::system::detail::internal::scalar::set_union(first1, last1, first2, last2, result, comp); +} + + +} // end namespace detail +} // end namespace cpp +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cpp/detail/sort.h b/compat/thrust/system/cpp/detail/sort.h new file mode 100644 index 0000000..60244e2 --- /dev/null +++ b/compat/thrust/system/cpp/detail/sort.h @@ -0,0 +1,60 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cpp +{ +namespace detail +{ + +template + void stable_sort(execution_policy &, + RandomAccessIterator first, + RandomAccessIterator last, + StrictWeakOrdering comp) +{ + thrust::system::detail::internal::scalar::stable_sort(first, last, comp); +} + +template + void stable_sort_by_key(execution_policy &, + RandomAccessIterator1 keys_first, + RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first, + StrictWeakOrdering comp) +{ + thrust::system::detail::internal::scalar::stable_sort_by_key(keys_first, keys_last, values_first, comp); +} + +} // end namespace detail +} // end namespace cpp +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cpp/detail/swap_ranges.h b/compat/thrust/system/cpp/detail/swap_ranges.h new file mode 100644 index 0000000..a834a2c --- /dev/null +++ b/compat/thrust/system/cpp/detail/swap_ranges.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// cpp has no special swap_ranges + diff --git a/compat/thrust/system/cpp/detail/tabulate.h b/compat/thrust/system/cpp/detail/tabulate.h new file mode 100644 index 0000000..a307fc5 --- /dev/null +++ b/compat/thrust/system/cpp/detail/tabulate.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system has no special version of this algorithm + diff --git a/compat/thrust/system/cpp/detail/temporary_buffer.h b/compat/thrust/system/cpp/detail/temporary_buffer.h new file mode 100644 index 0000000..628bd75 --- /dev/null +++ b/compat/thrust/system/cpp/detail/temporary_buffer.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +// this system has no special temporary buffer functions + diff --git a/compat/thrust/system/cpp/detail/transform.h b/compat/thrust/system/cpp/detail/transform.h new file mode 100644 index 0000000..5909d4a --- /dev/null +++ b/compat/thrust/system/cpp/detail/transform.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// cpp has no special transform + diff --git a/compat/thrust/system/cpp/detail/transform_reduce.h b/compat/thrust/system/cpp/detail/transform_reduce.h new file mode 100644 index 0000000..a307fc5 --- /dev/null +++ b/compat/thrust/system/cpp/detail/transform_reduce.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system has no special version of this algorithm + diff --git a/compat/thrust/system/cpp/detail/transform_scan.h b/compat/thrust/system/cpp/detail/transform_scan.h new file mode 100644 index 0000000..a307fc5 --- /dev/null +++ b/compat/thrust/system/cpp/detail/transform_scan.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system has no special version of this algorithm + diff --git a/compat/thrust/system/cpp/detail/uninitialized_copy.h b/compat/thrust/system/cpp/detail/uninitialized_copy.h new file mode 100644 index 0000000..a307fc5 --- /dev/null +++ b/compat/thrust/system/cpp/detail/uninitialized_copy.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system has no special version of this algorithm + diff --git a/compat/thrust/system/cpp/detail/uninitialized_fill.h b/compat/thrust/system/cpp/detail/uninitialized_fill.h new file mode 100644 index 0000000..a307fc5 --- /dev/null +++ b/compat/thrust/system/cpp/detail/uninitialized_fill.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system has no special version of this algorithm + diff --git a/compat/thrust/system/cpp/detail/unique.h b/compat/thrust/system/cpp/detail/unique.h new file mode 100644 index 0000000..cf74049 --- /dev/null +++ b/compat/thrust/system/cpp/detail/unique.h @@ -0,0 +1,61 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cpp +{ +namespace detail +{ + +template + ForwardIterator unique(execution_policy &, + ForwardIterator first, + ForwardIterator last, + BinaryPredicate binary_pred) +{ + return thrust::system::detail::internal::scalar::unique(first, last, binary_pred); +} + +template + OutputIterator unique_copy(execution_policy &, + InputIterator first, + InputIterator last, + OutputIterator output, + BinaryPredicate binary_pred) +{ + return thrust::system::detail::internal::scalar::unique_copy(first, last, output, binary_pred); +} + +} // end namespace detail +} // end namespace cpp +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cpp/detail/unique_by_key.h b/compat/thrust/system/cpp/detail/unique_by_key.h new file mode 100644 index 0000000..a9f13d6 --- /dev/null +++ b/compat/thrust/system/cpp/detail/unique_by_key.h @@ -0,0 +1,70 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cpp +{ +namespace detail +{ + +template + thrust::pair + unique_by_key(execution_policy &, + ForwardIterator1 keys_first, + ForwardIterator1 keys_last, + ForwardIterator2 values_first, + BinaryPredicate binary_pred) +{ + return thrust::system::detail::internal::scalar::unique_by_key(keys_first, keys_last, values_first, binary_pred); +} + + +template + thrust::pair + unique_by_key_copy(execution_policy &, + InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + OutputIterator1 keys_output, + OutputIterator2 values_output, + BinaryPredicate binary_pred) +{ + return thrust::system::detail::internal::scalar::unique_by_key_copy(keys_first, keys_last, values_first, keys_output, values_output, binary_pred); +} + +} // end namespace detail +} // end namespace cpp +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cpp/detail/vector.inl b/compat/thrust/system/cpp/detail/vector.inl new file mode 100644 index 0000000..03bffcd --- /dev/null +++ b/compat/thrust/system/cpp/detail/vector.inl @@ -0,0 +1,97 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cpp +{ + +template + vector + ::vector() + : super_t() +{} + +template + vector + ::vector(size_type n) + : super_t(n) +{} + +template + vector + ::vector(size_type n, const value_type &value) + : super_t(n,value) +{} + +template + vector + ::vector(const vector &x) + : super_t(x) +{} + +template + template + vector + ::vector(const thrust::detail::vector_base &x) + : super_t(x) +{} + +template + template + vector + ::vector(const std::vector &x) + : super_t(x) +{} + +template + template + vector + ::vector(InputIterator first, InputIterator last) + : super_t(first,last) +{} + +template + template + vector & + vector + ::operator=(const std::vector &x) +{ + super_t::operator=(x); + return *this; +} + +template + template + vector & + vector + ::operator=(const thrust::detail::vector_base &x) +{ + super_t::operator=(x); + return *this; +} + +} // end cpp +} // end system +} // end thrust + diff --git a/compat/thrust/system/cpp/execution_policy.h b/compat/thrust/system/cpp/execution_policy.h new file mode 100644 index 0000000..f192eb9 --- /dev/null +++ b/compat/thrust/system/cpp/execution_policy.h @@ -0,0 +1,157 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +/*! \file thrust/system/cpp/execution_policy.h + * \brief Execution policies for Thrust's standard C++ system. + */ + +#include + +// get the execution policies definitions first +#include + +// get the definition of par +#include + +// now get all the algorithm definitions + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +// define these entities here for the purpose of Doxygenating them +// they are actually defined elsewhere +#if 0 +namespace thrust +{ +namespace system +{ +namespace cpp +{ + + +/*! \addtogroup execution_policies + * \{ + */ + + +/*! \p thrust::system::cpp::execution_policy is the base class for all Thrust parallel execution + * policies which are derived from Thrust's standard C++ backend system. + */ +template +struct execution_policy : thrust::execution_policy +{}; + + +/*! \p thrust::system::cpp::tag is a type representing Thrust's standard C++ backend system in C++'s type system. + * Iterators "tagged" with a type which is convertible to \p cpp::tag assert that they may be + * "dispatched" to algorithm implementations in the \p cpp system. + */ +struct tag : thrust::system::cpp::execution_policy { unspecified }; + + +/*! + * \p thrust::system::cpp::par is the parallel execution policy associated with Thrust's standard + * C++ backend system. 
+ * + * Instead of relying on implicit algorithm dispatch through iterator system tags, users may + * directly target Thrust's C++ backend system by providing \p thrust::cpp::par as an algorithm + * parameter. + * + * Explicit dispatch can be useful in avoiding the introduction of data copies into containers such + * as \p thrust::cpp::vector. + * + * The type of \p thrust::cpp::par is implementation-defined. + * + * The following code snippet demonstrates how to use \p thrust::cpp::par to explicitly dispatch an + * invocation of \p thrust::for_each to the standard C++ backend system: + * + * \code + * #include + * #include + * #include + * + * struct printf_functor + * { + * __host__ __device__ + * void operator()(int x) + * { + * printf("%d\n"); + * } + * }; + * ... + * int vec[3]; + * vec[0] = 0; vec[1] = 1; vec[2] = 2; + * + * thrust::for_each(thrust::cpp::par, vec.begin(), vec.end(), printf_functor()); + * + * // 0 1 2 is printed to standard output in some unspecified order + * \endcode + */ +static const unspecified par; + + +/*! \} + */ + + +} // end cpp +} // end system +} // end thrust +#endif + + diff --git a/compat/thrust/system/cpp/memory.h b/compat/thrust/system/cpp/memory.h new file mode 100644 index 0000000..f3a58b8 --- /dev/null +++ b/compat/thrust/system/cpp/memory.h @@ -0,0 +1,414 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file thrust/system/cpp/memory.h + * \brief Managing memory associated with Thrust's standard C++ system. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cpp +{ + +template class pointer; + +} // end cpp +} // end system +} // end thrust + + +/*! \cond + */ + +// specialize std::iterator_traits to avoid problems with the name of +// pointer's constructor shadowing its nested pointer type +// do this before pointer is defined so the specialization is correctly +// used inside the definition +namespace std +{ + +template + struct iterator_traits > +{ + private: + typedef thrust::system::cpp::pointer ptr; + + public: + typedef typename ptr::iterator_category iterator_category; + typedef typename ptr::value_type value_type; + typedef typename ptr::difference_type difference_type; + typedef ptr pointer; + typedef typename ptr::reference reference; +}; // end iterator_traits + +} // end std + +/*! \endcond + */ + + +namespace thrust +{ +namespace system +{ + +/*! \addtogroup system_backends Systems + * \ingroup system + * \{ + */ + +/*! \namespace thrust::system::cpp + * \brief \p thrust::system::cpp is the namespace containing functionality for allocating, manipulating, + * and deallocating memory available to Thrust's standard C++ backend system. + * The identifiers are provided in a separate namespace underneath thrust::system + * for import convenience but are also aliased in the top-level thrust::cpp + * namespace for easy access. 
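As a hedged aside on the \p par documentation quoted above (illustration only, not part of the patched sources): the snippet passes printf a "%d" format without the matching argument and calls begin()/end() on a raw C array, so a self-contained variant that compiles as written, still dispatching explicitly to the standard C++ backend, might look like this:

// Hedged usage sketch, assuming only standard Thrust headers.
#include <thrust/system/cpp/execution_policy.h>
#include <thrust/for_each.h>
#include <cstdio>

struct print_functor
{
  __host__ __device__
  void operator()(int x) { std::printf("%d\n", x); }
};

int main()
{
  int vec[3] = {0, 1, 2};
  // a raw array has no begin()/end(); plain pointers already act as iterators
  thrust::for_each(thrust::cpp::par, vec, vec + 3, print_functor());
  return 0;  // prints 0, 1 and 2, one per line
}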
+ * + */ +namespace cpp +{ + +// forward declaration of reference for pointer +template class reference; + +/*! \cond + */ + +// XXX nvcc + msvc have trouble instantiating reference below +// this is a workaround +namespace detail +{ + +template + struct reference_msvc_workaround +{ + typedef thrust::system::cpp::reference type; +}; // end reference_msvc_workaround + +} // end detail + +/*! \endcond + */ + + +/*! \p pointer stores a pointer to an object allocated in memory available to the cpp system. + * This type provides type safety when dispatching standard algorithms on ranges resident + * in cpp memory. + * + * \p pointer has pointer semantics: it may be dereferenced and manipulated with pointer arithmetic. + * + * \p pointer can be created with the function \p cpp::malloc, or by explicitly calling its constructor + * with a raw pointer. + * + * The raw pointer encapsulated by a \p pointer may be obtained by eiter its get member function + * or the \p raw_pointer_cast function. + * + * \note \p pointer is not a "smart" pointer; it is the programmer's responsibility to deallocate memory + * pointed to by \p pointer. + * + * \tparam T specifies the type of the pointee. + * + * \see cpp::malloc + * \see cpp::free + * \see raw_pointer_cast + */ +template + class pointer + : public thrust::pointer< + T, + thrust::system::cpp::tag, + thrust::system::cpp::reference, + thrust::system::cpp::pointer + > +{ + /*! \cond + */ + + private: + typedef thrust::pointer< + T, + thrust::system::cpp::tag, + //thrust::system::cpp::reference, + typename detail::reference_msvc_workaround::type, + thrust::system::cpp::pointer + > super_t; + + /*! \endcond + */ + + public: + // note that cpp::pointer's member functions need __host__ __device__ + // to interoperate with nvcc + iterators' dereference member function + + /*! \p pointer's no-argument constructor initializes its encapsulated pointer to \c 0. + */ + __host__ __device__ + pointer() : super_t() {} + + /*! This constructor allows construction of a pointer from a T*. + * + * \param ptr A raw pointer to copy from, presumed to point to a location in memory + * accessible by the \p cpp system. + * \tparam OtherT \p OtherT shall be convertible to \p T. + */ + template + __host__ __device__ + explicit pointer(OtherT *ptr) : super_t(ptr) {} + + /*! This constructor allows construction from another pointer-like object with related type. + * + * \param other The \p OtherPointer to copy. + * \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible + * to \p thrust::system::cpp::tag and its element type shall be convertible to \p T. + */ + template + __host__ __device__ + pointer(const OtherPointer &other, + typename thrust::detail::enable_if_pointer_is_convertible< + OtherPointer, + pointer + >::type * = 0) : super_t(other) {} + + /*! Assignment operator allows assigning from another pointer-like object with related type. + * + * \param other The other pointer-like object to assign from. + * \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible + * to \p thrust::system::cpp::tag and its element type shall be convertible to \p T. + */ + template + __host__ __device__ + typename thrust::detail::enable_if_pointer_is_convertible< + OtherPointer, + pointer, + pointer & + >::type + operator=(const OtherPointer &other) + { + return super_t::operator=(other); + } +}; // end pointer + + +/*! \p reference is a wrapped reference to an object stored in memory available to the \p cpp system. 
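To make the relationship between cpp::pointer, cpp::malloc and cpp::free described above concrete, here is a minimal, hedged sketch (illustration only, not part of the patch); the function name is chosen here for the example:

#include <thrust/system/cpp/memory.h>

void cpp_memory_demo()
{
  // typed allocation: storage for 10 ints owned by the cpp system
  thrust::cpp::pointer<int> p = thrust::cpp::malloc<int>(10);

  // the wrapped raw pointer can be recovered with get()
  // (thrust::raw_pointer_cast(p) is the equivalent free function)
  int *raw = p.get();
  raw[0] = 42;

  // pointer is not a smart pointer: storage must be released explicitly
  thrust::cpp::free(p);
}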
+ * \p reference is the type of the result of dereferencing a \p cpp::pointer. + * + * \tparam T Specifies the type of the referenced object. + */ +template + class reference + : public thrust::reference< + T, + thrust::system::cpp::pointer, + thrust::system::cpp::reference + > +{ + /*! \cond + */ + + private: + typedef thrust::reference< + T, + thrust::system::cpp::pointer, + thrust::system::cpp::reference + > super_t; + + /*! \endcond + */ + + public: + /*! \cond + */ + + typedef typename super_t::value_type value_type; + typedef typename super_t::pointer pointer; + + /*! \endcond + */ + + /*! This constructor initializes this \p reference to refer to an object + * pointed to by the given \p pointer. After this \p reference is constructed, + * it shall refer to the object pointed to by \p ptr. + * + * \param ptr A \p pointer to copy from. + */ + __host__ __device__ + explicit reference(const pointer &ptr) + : super_t(ptr) + {} + + /*! This constructor accepts a const reference to another \p reference of related type. + * After this \p reference is constructed, it shall refer to the same object as \p other. + * + * \param other A \p reference to copy from. + * \tparam OtherT The element type of the other \p reference. + * + * \note This constructor is templated primarily to allow initialization of reference + * from reference. + */ + template + __host__ __device__ + reference(const reference &other, + typename thrust::detail::enable_if_convertible< + typename reference::pointer, + pointer + >::type * = 0) + : super_t(other) + {} + + /*! Copy assignment operator copy assigns from another \p reference of related type. + * + * \param other The other \p reference to assign from. + * \return *this + * \tparam OtherT The element type of the other \p reference. + */ + template + reference &operator=(const reference &other); + + /*! Assignment operator assigns from a \p value_type. + * + * \param x The \p value_type to assign from. + * \return *this + */ + reference &operator=(const value_type &x); +}; // end reference + +/*! Exchanges the values of two objects referred to by \p reference. + * \p x The first \p reference of interest. + * \p y The second \p reference ot interest. + */ +template +__host__ __device__ +void swap(reference x, reference y); + +/*! Allocates an area of memory available to Thrust's cpp system. + * \param n Number of bytes to allocate. + * \return A cpp::pointer pointing to the beginning of the newly + * allocated memory. A null cpp::pointer is returned if + * an error occurs. + * \note The cpp::pointer returned by this function must be + * deallocated with \p cpp::free. + * \see cpp::free + * \see std::malloc + */ +inline pointer malloc(std::size_t n); + +/*! Allocates a typed area of memory available to Thrust's cpp system. + * \param n Number of elements to allocate. + * \return A cpp::pointer pointing to the beginning of the newly + * allocated elements. A null cpp::pointer is returned if + * an error occurs. + * \note The cpp::pointer returned by this function must be + * deallocated with \p cpp::free. + * \see cpp::free + * \see std::malloc + */ +template +inline pointer malloc(std::size_t n); + +/*! Deallocates an area of memory previously allocated by cpp::malloc. + * \param ptr A cpp::pointer pointing to the beginning of an area + * of memory previously allocated with cpp::malloc. + * \see cpp::malloc + * \see std::free + */ +inline void free(pointer ptr); + +// XXX upon c++11 +// template using allocator = thrust::detail::malloc_allocator >; + +/*! 
\p cpp::allocator is the default allocator used by the \p cpp system's containers such as + * cpp::vector if no user-specified allocator is provided. \p cpp::allocator allocates + * (deallocates) storage with \p cpp::malloc (\p cpp::free). + */ +template + struct allocator + : thrust::detail::malloc_allocator< + T, + tag, + pointer + > +{ + /*! The \p rebind metafunction provides the type of an \p allocator + * instantiated with another type. + * + * \tparam U The other type to use for instantiation. + */ + template + struct rebind + { + /*! The typedef \p other gives the type of the rebound \p allocator. + */ + typedef allocator other; + }; + + /*! No-argument constructor has no effect. + */ + __host__ __device__ + inline allocator() {} + + /*! Copy constructor has no effect. + */ + __host__ __device__ + inline allocator(const allocator &) {} + + /*! Constructor from other \p allocator has no effect. + */ + template + __host__ __device__ + inline allocator(const allocator &) {} + + /*! Destructor has no effect. + */ + __host__ __device__ + inline ~allocator() {} +}; // end allocator + +} // end cpp + +/*! \} + */ + +} // end system + +/*! \namespace thrust::cpp + * \brief \p thrust::cpp is a top-level alias for thrust::system::cpp. + */ +namespace cpp +{ + +using thrust::system::cpp::pointer; +using thrust::system::cpp::reference; +using thrust::system::cpp::malloc; +using thrust::system::cpp::free; +using thrust::system::cpp::allocator; + +} // end cpp + +} // end thrust + +#include + diff --git a/compat/thrust/system/cpp/vector.h b/compat/thrust/system/cpp/vector.h new file mode 100644 index 0000000..4282df9 --- /dev/null +++ b/compat/thrust/system/cpp/vector.h @@ -0,0 +1,149 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file thrust/system/cpp/vector.h + * \brief A dynamically-sizable array of elements which reside in memory available to + * Thrust's standard C++ system. + */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ + +// forward declaration of host_vector +template class host_vector; + +namespace system +{ +namespace cpp +{ + +// XXX upon c++11 +// template > using vector = thrust::detail::vector_base; + +/*! \p cpp::vector is a container that supports random access to elements, + * constant time removal of elements at the end, and linear time insertion + * and removal of elements at the beginning or in the middle. The number of + * elements in a \p cpp::vector may vary dynamically; memory management is + * automatic. The elements contained in a \p cpp::vector reside in memory + * available to the \p cpp system. + * + * \tparam T The element type of the \p cpp::vector. + * \tparam Allocator The allocator type of the \p cpp::vector. Defaults to \p cpp::allocator. 
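A brief, hedged illustration of the container described above (not part of the patch): a cpp::vector holds its elements in ordinary host memory, interoperates with std::vector, and dispatches Thrust algorithms to the cpp backend through its iterators.

#include <thrust/system/cpp/vector.h>
#include <thrust/sort.h>
#include <vector>

void cpp_vector_demo()
{
  std::vector<int> src(4);
  src[0] = 3; src[1] = 1; src[2] = 4; src[3] = 1;

  // construction from a std::vector copies the elements into cpp memory
  thrust::cpp::vector<int> v(src.begin(), src.end());

  // the iterators are tagged with the cpp system, so this call is
  // dispatched to the standard C++ backend
  thrust::sort(v.begin(), v.end());   // v is now 1 1 3 4
}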
+ * + * \see http://www.sgi.com/tech/stl/Vector.html + * \see host_vector For the documentation of the complete interface which is + * shared by \p cpp::vector + * \see device_vector + */ +template > + class vector + : public thrust::detail::vector_base +{ + /*! \cond + */ + private: + typedef thrust::detail::vector_base super_t; + /*! \endcond + */ + + public: + + /*! \cond + */ + typedef typename super_t::size_type size_type; + typedef typename super_t::value_type value_type; + + /*! \endcond + */ + + /*! This constructor creates an empty \p cpp::vector. + */ + vector(); + + /*! This constructor creates a \p cpp::vector with \p n default-constructed elements. + * \param n The size of the \p cpp::vector to create. + */ + explicit vector(size_type n); + + /*! This constructor creates a \p cpp::vector with \p n copies of \p value. + * \param n The size of the \p cpp::vector to create. + * \param value An element to copy. + */ + explicit vector(size_type n, const value_type &value); + + /*! Copy constructor copies from another \p cpp::vector. + * \param x The other \p cpp::vector to copy. + */ + vector(const vector &x); + + /*! This constructor copies from another Thrust vector-like object. + * \param x The other object to copy from. + */ + template + vector(const thrust::detail::vector_base &x); + + /*! This constructor copies from a \c std::vector. + * \param x The \c std::vector to copy from. + */ + template + vector(const std::vector &x); + + /*! This constructor creates a \p cpp::vector by copying from a range. + * \param first The beginning of the range. + * \param last The end of the range. + */ + template + vector(InputIterator first, InputIterator last); + + // XXX vector_base should take a Derived type so we don't have to define these superfluous assigns + + /*! Assignment operator assigns from a \c std::vector. + * \param x The \c std::vector to assign from. + * \return *this + */ + template + vector &operator=(const std::vector &x); + + /*! Assignment operator assigns from another Thrust vector-like object. + * \param x The other object to assign from. + * \return *this + */ + template + vector &operator=(const thrust::detail::vector_base &x); +}; // end vector + +} // end cpp +} // end system + +// alias system::cpp names at top-level +namespace cpp +{ + +using thrust::system::cpp::vector; + +} // end cpp + +} // end thrust + +#include + diff --git a/compat/thrust/system/cuda/detail/adjacent_difference.h b/compat/thrust/system/cuda/detail/adjacent_difference.h new file mode 100644 index 0000000..ec51794 --- /dev/null +++ b/compat/thrust/system/cuda/detail/adjacent_difference.h @@ -0,0 +1,51 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file adjacent_difference.h + * \brief CUDA implementation of adjacent_difference. 
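Before the device-side implementation that follows, a hedged reminder (illustration only, not part of the patch) of the user-visible contract of adjacent_difference with the default binary operator, subtraction:

#include <thrust/adjacent_difference.h>
#include <thrust/device_vector.h>

void adjacent_difference_demo()
{
  thrust::device_vector<int> d(4);
  d[0] = 1; d[1] = 4; d[2] = 9; d[3] = 16;

  thrust::device_vector<int> result(4);

  // result[0] = d[0]; result[i] = d[i] - d[i-1] for i > 0
  thrust::adjacent_difference(d.begin(), d.end(), result.begin());
  // result now holds 1, 3, 5, 7
}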
+ */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ + +template +OutputIterator adjacent_difference(execution_policy &exec, + InputIterator first, InputIterator last, + OutputIterator result, + BinaryFunction binary_op); + +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/cuda/detail/adjacent_difference.inl b/compat/thrust/system/cuda/detail/adjacent_difference.inl new file mode 100644 index 0000000..9e4756a --- /dev/null +++ b/compat/thrust/system/cuda/detail/adjacent_difference.inl @@ -0,0 +1,197 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ +namespace detail +{ + +template +struct last_index_in_each_interval : public thrust::unary_function +{ + typedef typename Decomposition::index_type index_type; + + Decomposition decomp; + + last_index_in_each_interval(Decomposition decomp) : decomp(decomp) {} + + __host__ __device__ + index_type operator()(index_type interval) + { + return decomp[interval].end() - 1; + } +}; + +template +struct adjacent_difference_closure +{ + InputIterator1 input; + InputIterator2 input_copy; + OutputIterator output; + BinaryFunction binary_op; + Decomposition decomp; + Context context; + + typedef Context context_type; + + adjacent_difference_closure(InputIterator1 input, + InputIterator2 input_copy, + OutputIterator output, + BinaryFunction binary_op, + Decomposition decomp, + Context context = Context()) + : input(input), input_copy(input_copy), output(output), binary_op(binary_op), decomp(decomp), context(context) {} + + __device__ __thrust_forceinline__ + void operator()(void) + { + typedef typename thrust::iterator_value::type InputType; + typedef typename Decomposition::index_type index_type; + + // this block processes results in [range.begin(), range.end()) + thrust::system::detail::internal::index_range range = decomp[context.block_index()]; + + input_copy += context.block_index() - 1; + + // prime the temp values for all threads so we don't need to launch a default constructor + InputType next_left = (context.block_index() == 0) ? 
*input : *input_copy; + + index_type base = range.begin(); + index_type i = range.begin() + context.thread_index(); + + if (i < range.end()) + { + if (context.thread_index() > 0) + { + InputIterator1 temp = input + (i - 1); + next_left = *temp; + } + } + + input += i; + output += i; + + while (base < range.end()) + { + InputType curr_left = next_left; + + if (i + context.block_dimension() < range.end()) + { + InputIterator1 temp = input + (context.block_dimension() - 1); + next_left = *temp; + } + + context.barrier(); + + if (i < range.end()) + { + if (i == 0) + *output = *input; + else + { + InputType x = *input; + *output = binary_op(x, curr_left); + } + } + + i += context.block_dimension(); + base += context.block_dimension(); + input += context.block_dimension(); + output += context.block_dimension(); + } + } +}; + +} // end namespace detail + + +__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN + +template +OutputIterator adjacent_difference(execution_policy &exec, + InputIterator first, InputIterator last, + OutputIterator result, + BinaryFunction binary_op) +{ + typedef typename thrust::iterator_value::type InputType; + typedef typename thrust::iterator_difference::type IndexType; + typedef thrust::system::detail::internal::uniform_decomposition Decomposition; + + IndexType n = last - first; + + if (n == 0) + return result; + + Decomposition decomp = default_decomposition(last - first); + + // allocate temporary storage + thrust::detail::temporary_array temp(exec, decomp.size() - 1); + + // gather last value in each interval + detail::last_index_in_each_interval unary_op(decomp); + thrust::gather(exec, + thrust::make_transform_iterator(thrust::counting_iterator(0), unary_op), + thrust::make_transform_iterator(thrust::counting_iterator(0), unary_op) + (decomp.size() - 1), + first, + temp.begin()); + + + typedef typename thrust::detail::temporary_array::iterator InputIterator2; + typedef detail::blocked_thread_array Context; + typedef detail::adjacent_difference_closure Closure; + + Closure closure(first, temp.begin(), result, binary_op, decomp); + + detail::launch_closure(closure, decomp.size()); + + return result + n; +} + +__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END + +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cuda/detail/assign_value.h b/compat/thrust/system/cuda/detail/assign_value.h new file mode 100644 index 0000000..c90cf65 --- /dev/null +++ b/compat/thrust/system/cuda/detail/assign_value.h @@ -0,0 +1,198 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ + + +// XXX WAR an issue with msvc 2005 (cl v14.00) which creates multiply-defined +// symbols resulting from assign_value +#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC) && (_MSC_VER <= 1400) + +namespace +{ + +template +inline __host__ __device__ + void assign_value_msvc2005_war(thrust::cuda::execution_policy &exec, Pointer1 dst, Pointer2 src) +{ + // XXX war nvbugs/881631 + struct war_nvbugs_881631 + { + __host__ inline static void host_path(thrust::cuda::execution_policy &exec, Pointer1 dst, Pointer2 src) + { + thrust::copy(exec, src, src + 1, dst); + } + + __device__ inline static void device_path(thrust::cuda::execution_policy &, Pointer1 dst, Pointer2 src) + { + *thrust::raw_pointer_cast(dst) = *thrust::raw_pointer_cast(src); + } + }; + +#ifndef __CUDA_ARCH__ + war_nvbugs_881631::host_path(exec,dst,src); +#else + war_nvbugs_881631::device_path(exec,dst,src); +#endif // __CUDA_ARCH__ +} // end assign_value_msvc2005_war() + +} // end anon namespace + +template +inline __host__ __device__ + void assign_value(thrust::cuda::execution_policy &exec, Pointer1 dst, Pointer2 src) +{ + return assign_value_msvc2005_war(exec,dst,src); +} // end assign_value() + +#else + +template +inline __host__ __device__ + void assign_value(thrust::cuda::execution_policy &exec, Pointer1 dst, Pointer2 src) +{ + // XXX war nvbugs/881631 + struct war_nvbugs_881631 + { + __host__ inline static void host_path(thrust::cuda::execution_policy &exec, Pointer1 dst, Pointer2 src) + { + thrust::copy(exec, src, src + 1, dst); + } + + __device__ inline static void device_path(thrust::cuda::execution_policy &, Pointer1 dst, Pointer2 src) + { + *thrust::raw_pointer_cast(dst) = *thrust::raw_pointer_cast(src); + } + }; + +#ifndef __CUDA_ARCH__ + war_nvbugs_881631::host_path(exec,dst,src); +#else + war_nvbugs_881631::device_path(exec,dst,src); +#endif // __CUDA_ARCH__ +} // end assign_value() + +#endif // msvc 2005 WAR + + +// XXX WAR an issue with msvc 2005 (cl v14.00) which creates multiply-defined +// symbols resulting from assign_value +#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC) && (_MSC_VER <= 1400) + +namespace +{ + + +template +inline __host__ __device__ + void assign_value_msvc2005_war(cross_system &systems, Pointer1 dst, Pointer2 src) +{ + // XXX war nvbugs/881631 + struct war_nvbugs_881631 + { + __host__ inline static void host_path(cross_system &systems, Pointer1 dst, Pointer2 src) + { + // rotate the systems so that they are ordered the same as (src, dst) + // for the call to thrust::copy + cross_system rotated_systems = systems.rotate(); + thrust::copy(rotated_systems, src, src + 1, dst); + } + + __device__ inline static void device_path(cross_system &systems, Pointer1 dst, Pointer2 src) + { + // XXX forward the true cuda::execution_policy inside systems here + // instead of materializing a tag + thrust::cuda::tag cuda_tag; + thrust::system::cuda::detail::assign_value(cuda_tag, dst, src); + } + }; + +#if __CUDA_ARCH__ + war_nvbugs_881631::device_path(systems,dst,src); +#else + war_nvbugs_881631::host_path(systems,dst,src); +#endif +} // end assign_value_msvc2005_war + + +} // end anon namespace + + +template +inline __host__ __device__ + void assign_value(cross_system &systems, Pointer1 dst, Pointer2 src) +{ + return assign_value_msvc2005_war(systems,dst,src); +} // end assign_value() + + +#else + + +template +inline __host__ __device__ + void 
assign_value(cross_system &systems, Pointer1 dst, Pointer2 src) +{ + // XXX war nvbugs/881631 + struct war_nvbugs_881631 + { + __host__ inline static void host_path(cross_system &systems, Pointer1 dst, Pointer2 src) + { + // rotate the systems so that they are ordered the same as (src, dst) + // for the call to thrust::copy + cross_system rotated_systems = systems.rotate(); + thrust::copy(rotated_systems, src, src + 1, dst); + } + + __device__ inline static void device_path(cross_system &systems, Pointer1 dst, Pointer2 src) + { + // XXX forward the true cuda::execution_policy inside systems here + // instead of materializing a tag + thrust::cuda::tag cuda_tag; + thrust::system::cuda::detail::assign_value(cuda_tag, dst, src); + } + }; + +#if __CUDA_ARCH__ + war_nvbugs_881631::device_path(systems,dst,src); +#else + war_nvbugs_881631::host_path(systems,dst,src); +#endif +} // end assign_value() + + +#endif // msvc 2005 WAR + + +} // end detail +} // end cuda +} // end system +} // end thrust + diff --git a/compat/thrust/system/cuda/detail/binary_search.h b/compat/thrust/system/cuda/detail/binary_search.h new file mode 100644 index 0000000..a307fc5 --- /dev/null +++ b/compat/thrust/system/cuda/detail/binary_search.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system has no special version of this algorithm + diff --git a/compat/thrust/system/cuda/detail/block/copy.h b/compat/thrust/system/cuda/detail/block/copy.h new file mode 100644 index 0000000..9cc786b --- /dev/null +++ b/compat/thrust/system/cuda/detail/block/copy.h @@ -0,0 +1,223 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file copy.h + * \brief CUDA implementation of device-to-device copy, + * based on Gregory Diamos' memcpy code. 
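+ *
+ * When source and destination are both aligned to the wide word type, the
+ * block cooperatively copies num_bytes / sizeof(int2) wide words and then
+ * finishes the remaining bytes one at a time; otherwise it falls back to a
+ * plain byte-wise copy. Every loop below follows the same block-strided
+ * pattern (this is aligned_copy's inner loop):
+ *
+ * \code
+ * for(unsigned int i = context.thread_index(); i < n; i += context.block_dimension())
+ *   dst[i] = src[i];
+ * \endcode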
+ */ + +#pragma once + +#include + +#include + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ +namespace block +{ + +namespace trivial_copy_detail +{ + + +template + inline __device__ thrust::pair quotient_and_remainder(Size n, Size d) +{ + Size quotient = n / d; + Size remainder = n - d * quotient; + return thrust::make_pair(quotient,remainder); +} // end quotient_and_remainder() + + +// assumes the addresses dst & src are aligned to T boundaries +template +__device__ __thrust_forceinline__ +void aligned_copy(Context context, T *dst, const T *src, unsigned int num_elements) +{ + for(unsigned int i = context.thread_index(); + i < num_elements; + i += context.block_dimension()) + { + dst[i] = src[i]; + } +} // end aligned_copy() + + +} // end namespace trivial_copy_detail + + +template +__device__ __thrust_forceinline__ +void trivial_copy(Context context, void* destination_, const void* source_, size_t num_bytes) +{ + // reinterpret at bytes + char* destination = reinterpret_cast(destination_); + const char* source = reinterpret_cast(source_); + + // TODO replace this with uint64 +#if THRUST_DEVICE_COMPILER != THRUST_DEVICE_COMPILER_NVCC + typedef long long int2; + typedef long long uint2; +#endif // THRUST_DEVICE_COMPILER_NVCC + + // check alignment + // XXX can we do this in three steps? + // 1. copy until alignment is met + // 2. go hog wild + // 3. get the remainder + if(reinterpret_cast(destination) % sizeof(uint2) != 0 || reinterpret_cast(source) % sizeof(uint2) != 0) + { + for(unsigned int i = context.thread_index(); i < num_bytes; i += context.block_dimension()) + { + destination[i] = source[i]; + } + } + else + { + // it's aligned; do a wide copy + + // this pair stores the number of int2s in the aligned portion of the arrays + // and the number of bytes in the remainder + const thrust::pair num_wide_elements_and_remainder_bytes = trivial_copy_detail::quotient_and_remainder(num_bytes, sizeof(int2)); + + // copy int2 elements + trivial_copy_detail::aligned_copy(context, + reinterpret_cast(destination), + reinterpret_cast(source), + num_wide_elements_and_remainder_bytes.first); + + // XXX we could copy int elements here + + // copy remainder byte by byte + + // to find the beginning of the remainder arrays, we need to point at the beginning, and then skip the number of bytes in the aligned portion + // this is sizeof(int2) times the number of int2s comprising the aligned portion + const char *remainder_first = reinterpret_cast(source + sizeof(int2) * num_wide_elements_and_remainder_bytes.first); + char *remainder_result = reinterpret_cast(destination + sizeof(int2) * num_wide_elements_and_remainder_bytes.first); + + trivial_copy_detail::aligned_copy(context, remainder_result, remainder_first, num_wide_elements_and_remainder_bytes.second); + } +} // end trivial_copy() + + +namespace detail +{ +namespace dispatch +{ + +template + __thrust_forceinline__ __device__ + RandomAccessIterator2 copy(Context context, + RandomAccessIterator1 first, + RandomAccessIterator1 last, + RandomAccessIterator2 result, + thrust::detail::true_type is_trivial_copy) +{ + typedef typename thrust::iterator_value::type T; + + const T *src = &thrust::raw_reference_cast(*first); + T *dst = &thrust::raw_reference_cast(*result); + + size_t n = (last - first); + thrust::system::cuda::detail::block::trivial_copy(context, dst, src, n * sizeof(T)); + return result + n; +} // end copy() + +template + __thrust_forceinline__ __device__ + 
RandomAccessIterator2 copy(Context context, + RandomAccessIterator1 first, + RandomAccessIterator1 last, + RandomAccessIterator2 result, + thrust::detail::false_type is_trivial_copy) +{ + RandomAccessIterator2 end_of_output = result + (last - first); + + // advance iterators + first += context.thread_index(); + result += context.thread_index(); + + for(; + first < last; + first += context.block_dimension(), + result += context.block_dimension()) + { + *result = *first; + } // end for + + return end_of_output; +} // end copy() + +} // end namespace dispatch +} // end namespace detail + +template + __thrust_forceinline__ __device__ + RandomAccessIterator2 copy(Context context, + RandomAccessIterator1 first, + RandomAccessIterator1 last, + RandomAccessIterator2 result) +{ + return detail::dispatch::copy(context, first, last, result, +#if __CUDA_ARCH__ < 200 + // does not work reliably on pre-Fermi due to "Warning: ... assuming global memory space" issues + thrust::detail::false_type() +#else + typename thrust::detail::dispatch::is_trivial_copy::type() +#endif + ); +} // end copy() + + +template +inline __device__ +RandomAccessIterator2 copy_n(Context &ctx, RandomAccessIterator1 first, Size n, RandomAccessIterator2 result) +{ + for(Size i = ctx.thread_index(); i < n; i += ctx.block_dimension()) + { + result[i] = first[i]; + } + + ctx.barrier(); + + return result + n; +} + + +} // end namespace block +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cuda/detail/block/exclusive_scan.h b/compat/thrust/system/cuda/detail/block/exclusive_scan.h new file mode 100644 index 0000000..580a757 --- /dev/null +++ b/compat/thrust/system/cuda/detail/block/exclusive_scan.h @@ -0,0 +1,74 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ +namespace block +{ + + +template +inline __device__ +typename thrust::iterator_value::type + inplace_exclusive_scan(Context &ctx, RandomAccessIterator first, T init, BinaryFunction op) +{ + // perform an inclusive scan, then shift right + block::inplace_inclusive_scan(ctx, first, op); + + typename thrust::iterator_value::type carry = first[ctx.block_dimension() - 1]; + + ctx.barrier(); + + typename thrust::iterator_value::type left = (ctx.thread_index() == 0) ? 
init : first[ctx.thread_index() - 1]; + + ctx.barrier(); + + first[ctx.thread_index()] = left; + + ctx.barrier(); + + return carry; +} + + +template +inline __device__ + typename thrust::iterator_value::type + inplace_exclusive_scan(Context &ctx, Iterator first, T init) +{ + return block::inplace_exclusive_scan(ctx, first, init, thrust::plus::type>()); +} + + +} // end namespace block +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cuda/detail/block/inclusive_scan.h b/compat/thrust/system/cuda/detail/block/inclusive_scan.h new file mode 100644 index 0000000..012f7cd --- /dev/null +++ b/compat/thrust/system/cuda/detail/block/inclusive_scan.h @@ -0,0 +1,191 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ +namespace block +{ + +template +__device__ __thrust_forceinline__ +void inclusive_scan(Context context, + InputIterator first, + BinaryFunction binary_op) +{ + // TODO generalize to arbitrary n + // TODO support dynamic block_size + const unsigned int block_size = Context::ThreadsPerBlock::value; + + typename thrust::iterator_value::type val = first[context.thread_index()]; + + if(block_size > 1) { if (context.thread_index() >= 1) { val = binary_op(first[context.thread_index() - 1], val); } context.barrier(); first[context.thread_index()] = val; context.barrier(); } + if(block_size > 2) { if (context.thread_index() >= 2) { val = binary_op(first[context.thread_index() - 2], val); } context.barrier(); first[context.thread_index()] = val; context.barrier(); } + if(block_size > 4) { if (context.thread_index() >= 4) { val = binary_op(first[context.thread_index() - 4], val); } context.barrier(); first[context.thread_index()] = val; context.barrier(); } + if(block_size > 8) { if (context.thread_index() >= 8) { val = binary_op(first[context.thread_index() - 8], val); } context.barrier(); first[context.thread_index()] = val; context.barrier(); } + if(block_size > 16) { if (context.thread_index() >= 16) { val = binary_op(first[context.thread_index() - 16], val); } context.barrier(); first[context.thread_index()] = val; context.barrier(); } + if(block_size > 32) { if (context.thread_index() >= 32) { val = binary_op(first[context.thread_index() - 32], val); } context.barrier(); first[context.thread_index()] = val; context.barrier(); } + if(block_size > 64) { if (context.thread_index() >= 64) { val = binary_op(first[context.thread_index() - 64], val); } context.barrier(); first[context.thread_index()] = val; context.barrier(); } + if(block_size > 128) { if (context.thread_index() >= 128) { val = binary_op(first[context.thread_index() - 128], val); } context.barrier(); first[context.thread_index()] = val; context.barrier(); } + if(block_size > 256) { if (context.thread_index() >= 256) { val = binary_op(first[context.thread_index() 
- 256], val); } context.barrier(); first[context.thread_index()] = val; context.barrier(); } + if(block_size > 512) { if (context.thread_index() >= 512) { val = binary_op(first[context.thread_index() - 512], val); } context.barrier(); first[context.thread_index()] = val; context.barrier(); } + if(block_size > 1024) { if (context.thread_index() >= 1024) { val = binary_op(first[context.thread_index() - 1024], val); } context.barrier(); first[context.thread_index()] = val; context.barrier(); } +} // end inclusive_scan() + + +template +__device__ __thrust_forceinline__ +void inclusive_scan_n(Context context, + InputIterator first, + Size n, + BinaryFunction binary_op) +{ + // TODO support n > context.block_dimension() + typename thrust::iterator_value::type val = first[context.thread_index()]; + + for (unsigned int i = 1; i < n; i <<= 1) + { + if (context.thread_index() < n && context.thread_index() >= i) + val = binary_op(first[context.thread_index() - i], val); + + context.barrier(); + + first[context.thread_index()] = val; + + context.barrier(); + } +} // end inclusive_scan() + + +template +__device__ __thrust_forceinline__ +void inclusive_scan_by_flag(Context context, + InputIterator1 first1, + InputIterator2 first2, + BinaryFunction binary_op) +{ + // TODO generalize to arbitrary n + // TODO support dynamic block_size + const unsigned int block_size = Context::ThreadsPerBlock::value; + + typename thrust::iterator_value::type flg = first1[context.thread_index()]; + typename thrust::iterator_value::type val = first2[context.thread_index()]; + + if(block_size > 1) { if (context.thread_index() >= 1) { if (!flg) { flg |= first1[context.thread_index() - 1]; val = binary_op(first2[context.thread_index() - 1], val); } } context.barrier(); first1[context.thread_index()] = flg; first2[context.thread_index()] = val; context.barrier(); } + if(block_size > 2) { if (context.thread_index() >= 2) { if (!flg) { flg |= first1[context.thread_index() - 2]; val = binary_op(first2[context.thread_index() - 2], val); } } context.barrier(); first1[context.thread_index()] = flg; first2[context.thread_index()] = val; context.barrier(); } + if(block_size > 4) { if (context.thread_index() >= 4) { if (!flg) { flg |= first1[context.thread_index() - 4]; val = binary_op(first2[context.thread_index() - 4], val); } } context.barrier(); first1[context.thread_index()] = flg; first2[context.thread_index()] = val; context.barrier(); } + if(block_size > 8) { if (context.thread_index() >= 8) { if (!flg) { flg |= first1[context.thread_index() - 8]; val = binary_op(first2[context.thread_index() - 8], val); } } context.barrier(); first1[context.thread_index()] = flg; first2[context.thread_index()] = val; context.barrier(); } + if(block_size > 16) { if (context.thread_index() >= 16) { if (!flg) { flg |= first1[context.thread_index() - 16]; val = binary_op(first2[context.thread_index() - 16], val); } } context.barrier(); first1[context.thread_index()] = flg; first2[context.thread_index()] = val; context.barrier(); } + if(block_size > 32) { if (context.thread_index() >= 32) { if (!flg) { flg |= first1[context.thread_index() - 32]; val = binary_op(first2[context.thread_index() - 32], val); } } context.barrier(); first1[context.thread_index()] = flg; first2[context.thread_index()] = val; context.barrier(); } + if(block_size > 64) { if (context.thread_index() >= 64) { if (!flg) { flg |= first1[context.thread_index() - 64]; val = binary_op(first2[context.thread_index() - 64], val); } } context.barrier(); first1[context.thread_index()] = 
flg; first2[context.thread_index()] = val; context.barrier(); } + if(block_size > 128) { if (context.thread_index() >= 128) { if (!flg) { flg |= first1[context.thread_index() - 128]; val = binary_op(first2[context.thread_index() - 128], val); } } context.barrier(); first1[context.thread_index()] = flg; first2[context.thread_index()] = val; context.barrier(); } + if(block_size > 256) { if (context.thread_index() >= 256) { if (!flg) { flg |= first1[context.thread_index() - 256]; val = binary_op(first2[context.thread_index() - 256], val); } } context.barrier(); first1[context.thread_index()] = flg; first2[context.thread_index()] = val; context.barrier(); } + if(block_size > 512) { if (context.thread_index() >= 512) { if (!flg) { flg |= first1[context.thread_index() - 512]; val = binary_op(first2[context.thread_index() - 512], val); } } context.barrier(); first1[context.thread_index()] = flg; first2[context.thread_index()] = val; context.barrier(); } + if(block_size > 1024) { if (context.thread_index() >= 1024) { if (!flg) { flg |= first1[context.thread_index() - 1024]; val = binary_op(first2[context.thread_index() - 1024], val); } } context.barrier(); first1[context.thread_index()] = flg; first2[context.thread_index()] = val; context.barrier(); } +} // end inclusive_scan_by_flag() + + +template +__device__ __thrust_forceinline__ +void inclusive_scan_by_flag_n(Context context, + InputIterator1 first1, + InputIterator2 first2, + Size n, + BinaryFunction binary_op) +{ + // TODO support n > context.block_dimension() + typename thrust::iterator_value::type flg = first1[context.thread_index()]; + typename thrust::iterator_value::type val = first2[context.thread_index()]; + + for (unsigned int i = 1; i < n; i <<= 1) + { + if (context.thread_index() < n && context.thread_index() >= i) + { + if (!flg) + { + flg |= first1[context.thread_index() - i]; + val = binary_op(first2[context.thread_index() - i], val); + } + } + + context.barrier(); + + first1[context.thread_index()] = flg; + first2[context.thread_index()] = val; + + context.barrier(); + } +} // end inclusive_scan_by_flag() + + +template +__device__ __thrust_forceinline__ +void inplace_inclusive_scan(Context &ctx, RandomAccessIterator first, BinaryFunction op) +{ + typename thrust::iterator_value::type x = first[ctx.thread_index()]; + + for(unsigned int offset = 1; offset < ctx.block_dimension(); offset *= 2) + { + if(ctx.thread_index() >= offset) + { + x = op(first[ctx.thread_index() - offset], x); + } + + ctx.barrier(); + + first[ctx.thread_index()] = x; + + ctx.barrier(); + } +} + + +template +__device__ __thrust_forceinline__ +void inplace_inclusive_scan(Context &ctx, RandomAccessIterator first) +{ + block::inplace_inclusive_scan(ctx, first, thrust::plus::type>()); +} + + +} // end namespace block +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cuda/detail/block/merge.h b/compat/thrust/system/cuda/detail/block/merge.h new file mode 100644 index 0000000..9af0b7b --- /dev/null +++ b/compat/thrust/system/cuda/detail/block/merge.h @@ -0,0 +1,74 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ +namespace block +{ + +template +__device__ __thrust_forceinline__ + RandomAccessIterator3 merge(Context context, + RandomAccessIterator1 first1, + RandomAccessIterator1 last1, + RandomAccessIterator2 first2, + RandomAccessIterator2 last2, + RandomAccessIterator3 result, + StrictWeakOrdering comp); + +// XXX assumes that context.block_dimension() <= n1 and +// context.block_dimension() <= n2 +// This algorithm is analogous to inplace_merge +// but instead of working on the ranges +// [first, middle) and [middle, last) +// it works on the ranges +// [first, first + n1) and [first + n1, first + n1 + n2) +template +__device__ __thrust_forceinline__ + void inplace_merge_by_key_n(Context context, + RandomAccessIterator1 keys_first, + RandomAccessIterator2 values_first, + Size1 n1, + Size2 n2, + StrictWeakOrdering comp); + +} // end namespace block +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/cuda/detail/block/merge.inl b/compat/thrust/system/cuda/detail/block/merge.inl new file mode 100644 index 0000000..5eae2b5 --- /dev/null +++ b/compat/thrust/system/cuda/detail/block/merge.inl @@ -0,0 +1,168 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ +namespace block +{ + +template +__device__ __thrust_forceinline__ + RandomAccessIterator3 merge(Context context, + RandomAccessIterator1 first1, + RandomAccessIterator1 last1, + RandomAccessIterator2 first2, + RandomAccessIterator2 last2, + RandomAccessIterator3 result, + StrictWeakOrdering comp) +{ + typedef typename thrust::iterator_difference::type difference1; + typedef typename thrust::iterator_difference::type difference2; + + difference1 n1 = last1 - first1; + difference2 n2 = last2 - first2; + + // find the rank of each element in the other array + difference2 rank2 = 0; + if(context.thread_index() < n1) + { + RandomAccessIterator1 x = first1; + x += context.thread_index(); + + // lower_bound ensures that x sorts before any equivalent element of input2 + // this ensures stability + rank2 = thrust::system::detail::generic::scalar::lower_bound(first2, last2, raw_reference_cast(*x), comp) - first2; + } // end if + + difference1 rank1 = 0; + if(context.thread_index() < n2) + { + RandomAccessIterator2 x = first2 + context.thread_index(); + + // upper_bound ensures that x sorts before any equivalent element of input1 + // this ensures stability + rank1 = thrust::system::detail::generic::scalar::upper_bound(first1, last1, raw_reference_cast(*x), comp) - first1; + } // end if + + if(context.thread_index() < n1) + { + // scatter each element from input1 + RandomAccessIterator1 src = first1 + context.thread_index(); + RandomAccessIterator3 dst = result + context.thread_index() + rank2; + + *dst = *src; + } + + if(context.thread_index() < n2) + { + // scatter each element from input2 + RandomAccessIterator2 src = first2 + context.thread_index(); + RandomAccessIterator3 dst = result + context.thread_index() + rank1; + + *dst = *src; + } + + return result + n1 + n2; +} // end merge + + +template +__device__ __thrust_forceinline__ + void inplace_merge_by_key_n(Context context, + RandomAccessIterator1 keys_first, + RandomAccessIterator2 values_first, + Size1 n1, + Size2 n2, + StrictWeakOrdering comp) +{ + RandomAccessIterator1 input1 = keys_first; + RandomAccessIterator1 input2 = keys_first + n1; + + RandomAccessIterator2 input1val = values_first; + RandomAccessIterator2 input2val = values_first + n1; + + typedef typename thrust::iterator_value::type KeyType; + typedef typename thrust::iterator_value::type ValueType; + + // XXX use uninitialized here + KeyType inp1 = input1[context.thread_index()]; ValueType inp1val = input1val[context.thread_index()]; + KeyType inp2 = input2[context.thread_index()]; ValueType inp2val = input2val[context.thread_index()]; + + // to merge input1 and input2, use binary search to find the rank of inp1 & inp2 in arrays input2 & input1, respectively + // as before, the "end" variables point to one element after the last element of the arrays + + // start by looking through input2 for inp1's rank + unsigned int start_1 = 0; + + // don't do the search if our value is beyond the end of input1 + if(context.thread_index() < n1) + { + start_1 = thrust::system::detail::generic::scalar::lower_bound_n(input2, n2, inp1, comp) - input2; + } // end if + + // now look through input1 for inp2's rank + unsigned int start_2 = 0; + + // don't do the search if our value is beyond the end of input2 + if(context.thread_index() < n2) + { + // upper_bound ensures that equivalent elements in the first range sort before the second + start_2 = 
thrust::system::detail::generic::scalar::upper_bound_n(input1, n1, inp2, comp) - input1; + } // end if + + context.barrier(); + + // Write back into the right position to the input arrays; can be done in place since we read in + // the input arrays into registers before. + if(context.thread_index() < n1) + { + input1[start_1 + context.thread_index()] = inp1; + input1val[start_1 + context.thread_index()] = inp1val; + } // end if + + if(context.thread_index() < n2) + { + input1[start_2 + context.thread_index()] = inp2; + input1val[start_2 + context.thread_index()] = inp2val; + } // end if +} // end inplace_merge_by_key_n() + + +} // end namespace block +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cuda/detail/block/merging_sort.h b/compat/thrust/system/cuda/detail/block/merging_sort.h new file mode 100644 index 0000000..8f8f999 --- /dev/null +++ b/compat/thrust/system/cuda/detail/block/merging_sort.h @@ -0,0 +1,199 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file merging_sort.h + * \brief Block version of merge sort + */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ +namespace block +{ + + +template +__device__ void conditional_swap(RandomAccessIterator1 keys_first, + RandomAccessIterator2 values_first, + const unsigned int i, + const unsigned int end, + bool pred, + Compare comp) +{ + typedef typename thrust::iterator_traits::value_type KeyType; + typedef typename thrust::iterator_traits::value_type ValueType; + + if(pred && i+1 +__device__ void transposition_sort(Context context, + RandomAccessIterator1 keys_first, + RandomAccessIterator2 values_first, + const unsigned int i, + const unsigned int end, + const unsigned int size, + Compare comp) +{ + const bool is_odd = i&0x1; + + for(unsigned int round=size/2; round>0; --round) + { + // ODDS + conditional_swap(keys_first, values_first, i, end, is_odd, comp); + context.barrier(); + + // EVENS + conditional_swap(keys_first, values_first, i, end, !is_odd, comp); + context.barrier(); + } +} + +template +__device__ void merge(Context context, + RandomAccessIterator1 keys_first, + RandomAccessIterator2 values_first, + const unsigned int i, + const unsigned int n, + unsigned int begin, + unsigned int end, + unsigned int h, + StrictWeakOrdering cmp) +{ + // INVARIANT: Every element i resides within a sequence [begin,end) + // of length h which is already sorted + while( h::value_type KeyType; + typedef typename thrust::iterator_traits::value_type ValueType; + + KeyType key; + ValueType value; + + unsigned int rank = i - begin; + + // prevent out-of-bounds access + if(i < new_end) + { + key = keys_first[i]; + + if(begin==new_begin) // in the left side of merging pair + { + RandomAccessIterator1 result = thrust::system::detail::generic::scalar::lower_bound_n(keys_first+end, new_end-end, key, 
cmp); + rank += (result - (keys_first+end)); + } + else // in the right side of merging pair + { + RandomAccessIterator1 result = thrust::system::detail::generic::scalar::upper_bound_n(keys_first+new_begin, begin-new_begin, key, cmp); + rank += (result - (keys_first+new_begin)); + } + + value = values_first[i]; + } + + context.barrier(); + + if(i < new_end) + { + keys_first[new_begin+rank] = key; + values_first[new_begin+rank] = value; + } + + context.barrier(); + + begin = new_begin; + end = new_end; + } +} + + +/*! Block-wise implementation of merge sort. + * It provides the same external interface as odd_even_sort. + */ +template +__device__ void merging_sort(Context context, + RandomAccessIterator1 keys_first, + RandomAccessIterator2 values_first, + const unsigned int n, + StrictWeakOrdering comp) +{ + // Phase 1: Sort subsequences of length 32 using odd-even + // transposition sort. The code below assumes that h is a + // power of 2. Empirically, 32 delivers best results, + // which is not surprising since that's the warp width. + unsigned int i = context.thread_index(); + unsigned int h = 32; + unsigned int begin=i&(~(h-1)), end=min(n,begin+h); + + transposition_sort(context, keys_first, values_first, i, end, h, comp); + + // Phase 2: Apply merge tree to produce final sorted results + merge(context, keys_first, values_first, i, n, begin, end, h, comp); +} // end merging_sort() + + +} // end namespace block +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cuda/detail/block/odd_even_sort.h b/compat/thrust/system/cuda/detail/block/odd_even_sort.h new file mode 100644 index 0000000..0fa0ea0 --- /dev/null +++ b/compat/thrust/system/cuda/detail/block/odd_even_sort.h @@ -0,0 +1,151 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file odd_even_sort.h + * \brief Block versions of Batcher's Odd-Even Merge Sort + */ + +#pragma once + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ +namespace block +{ + + +/*! Block-wise implementation of Batcher's Odd-Even Merge Sort + * This implementation is based on Nadathur Satish's. + */ +template + __device__ void odd_even_sort(KeyType *keys, + ValueType *data, + const unsigned int n, + StrictWeakOrdering comp) +{ + for(unsigned int p = blockDim.x>>1; p > 0; p >>= 1) + { + unsigned int q = blockDim.x>>1, r = 0, d = p; + + while(q >= p) + { + unsigned int j = threadIdx.x + d; + + // if j lies beyond the end of the array, we consider it "sorted" wrt i + // regardless of whether i lies beyond the end of the array + if(threadIdx.x < (blockDim.x-d) && (threadIdx.x & p) == r && j < n) + { + KeyType xikey = keys[threadIdx.x]; + KeyType xjkey = keys[j]; + + ValueType xivalue = data[threadIdx.x]; + ValueType xjvalue = data[j]; + + // does xj sort before xi? 
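+        // if so, swap both the keys and their associated values so the
+        // pair (threadIdx.x, j) is left in sorted order by this step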
+ if(comp(xjkey, xikey)) + { + keys[threadIdx.x] = xjkey; + keys[j] = xikey; + + data[threadIdx.x] = xjvalue; + data[j] = xivalue; + } // end if + } // end if + + d = q - p; + q >>= 1; + r = p; + + __syncthreads(); + } // end while + } // end for p +} // end odd_even_sort() + +template + __device__ void stable_odd_even_sort(KeyType *keys, + ValueType *data, + const unsigned int n, + StrictWeakOrdering comp) +{ + for(unsigned int i = 0; + i < blockDim.x>>1; + ++i) + { + bool thread_is_odd = threadIdx.x & 0x1; + + // do odds first + if(thread_is_odd && threadIdx.x + 1 < n) + { + KeyType xikey = keys[threadIdx.x]; + KeyType xjkey = keys[threadIdx.x + 1]; + + ValueType xivalue = data[threadIdx.x]; + ValueType xjvalue = data[threadIdx.x + 1]; + + // does xj sort before xi? + if(comp(xjkey, xikey)) + { + keys[threadIdx.x] = xjkey; + keys[threadIdx.x + 1] = xikey; + + data[threadIdx.x] = xjvalue; + data[threadIdx.x + 1] = xivalue; + } // end if + } // end if + + __syncthreads(); + + // do evens second + if(!thread_is_odd && threadIdx.x + 1 < n) + { + KeyType xikey = keys[threadIdx.x]; + KeyType xjkey = keys[threadIdx.x + 1]; + + ValueType xivalue = data[threadIdx.x]; + ValueType xjvalue = data[threadIdx.x + 1]; + + // does xj sort before xi? + if(comp(xjkey, xikey)) + { + keys[threadIdx.x] = xjkey; + keys[threadIdx.x + 1] = xikey; + + data[threadIdx.x] = xjvalue; + data[threadIdx.x + 1] = xivalue; + } // end if + } // end if + + __syncthreads(); + } // end for i +} // end stable_odd_even_sort() + + +} // end namespace block +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cuda/detail/block/reduce.h b/compat/thrust/system/cuda/detail/block/reduce.h new file mode 100644 index 0000000..e0a1901 --- /dev/null +++ b/compat/thrust/system/cuda/detail/block/reduce.h @@ -0,0 +1,67 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ +namespace block +{ + +/* Reduces [data, data + n) using binary_op and stores the result in data[0] + * + * Upon return the elements in [data + 1, data + n) have unspecified values. 
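+ *
+ * When n exceeds the block size, each thread first folds the elements beyond
+ * the leading block_dimension() slots into its own slot; the surviving
+ * partial results are then combined by repeatedly pairing the first half of
+ * the remaining range with the reversed second half, with a barrier between
+ * rounds.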
+ */ +template +__device__ __thrust_forceinline__ +void reduce_n(Context context, ValueIterator data, unsigned int n, BinaryFunction binary_op) +{ + if (context.block_dimension() < n) + { + for (unsigned int i = context.block_dimension() + context.thread_index(); i < n; i += context.block_dimension()) + data[context.thread_index()] = binary_op(data[context.thread_index()], data[i]); + + context.barrier(); + } + + while (n > 1) + { + unsigned int half = n / 2; + + if (context.thread_index() < half) + data[context.thread_index()] = binary_op(data[context.thread_index()], data[n - context.thread_index() - 1]); + + context.barrier(); + + n = n - half; + } +} + +} // end namespace block +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cuda/detail/copy.h b/compat/thrust/system/cuda/detail/copy.h new file mode 100644 index 0000000..8f7ee97 --- /dev/null +++ b/compat/thrust/system/cuda/detail/copy.h @@ -0,0 +1,79 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ + + +template + OutputIterator copy(execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result); + + +template + OutputIterator copy(cross_system exec, + InputIterator first, + InputIterator last, + OutputIterator result); + + +template + OutputIterator copy_n(execution_policy &exec, + InputIterator first, + Size n, + OutputIterator result); + + +template + OutputIterator copy_n(cross_system exec, + InputIterator first, + Size n, + OutputIterator result); + + +} // end detail +} // end cuda +} // end system +} // end thrust + +#include + diff --git a/compat/thrust/system/cuda/detail/copy.inl b/compat/thrust/system/cuda/detail/copy.inl new file mode 100644 index 0000000..125eebd --- /dev/null +++ b/compat/thrust/system/cuda/detail/copy.inl @@ -0,0 +1,88 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ + + +template + OutputIterator copy(execution_policy &system, + InputIterator first, + InputIterator last, + OutputIterator result) +{ + return thrust::system::cuda::detail::copy_device_to_device(system,first,last,result); +} // end copy() + + +template + OutputIterator copy(cross_system systems, + InputIterator first, + InputIterator last, + OutputIterator result) +{ + return thrust::system::cuda::detail::copy_cross_system(systems,first,last,result); +} // end copy() + + +template + OutputIterator copy_n(execution_policy &system, + InputIterator first, + Size n, + OutputIterator result) +{ + return thrust::system::cuda::detail::copy_device_to_device(system,first,first+n,result); +} // end copy_n() + + +template + OutputIterator copy_n(cross_system systems, + InputIterator first, + Size n, + OutputIterator result) +{ + return thrust::system::cuda::detail::copy_cross_system_n(systems,first,n,result); +} // end copy_n() + + +} // end detail +} // end cuda +} // end system +} // end thrust + diff --git a/compat/thrust/system/cuda/detail/copy_cross_system.h b/compat/thrust/system/cuda/detail/copy_cross_system.h new file mode 100644 index 0000000..f68ea3c --- /dev/null +++ b/compat/thrust/system/cuda/detail/copy_cross_system.h @@ -0,0 +1,59 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ + + +template + OutputIterator copy_cross_system(cross_system systems, + InputIterator begin, + InputIterator end, + OutputIterator result); + + +template + OutputIterator copy_cross_system_n(cross_system systems, + InputIterator begin, + Size n, + OutputIterator result); + + +} // end detail +} // end cuda +} // end system +} // end thrust + +#include + diff --git a/compat/thrust/system/cuda/detail/copy_cross_system.inl b/compat/thrust/system/cuda/detail/copy_cross_system.inl new file mode 100644 index 0000000..861cb2c --- /dev/null +++ b/compat/thrust/system/cuda/detail/copy_cross_system.inl @@ -0,0 +1,301 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace detail +{ + +// XXX WAR circular #inclusion problem +template class temporary_array; + +} // end detail + +namespace system +{ +namespace cuda +{ +namespace detail +{ + + +// general input to random access case +template + RandomAccessIterator copy_cross_system(cross_system systems, + InputIterator begin, + InputIterator end, + RandomAccessIterator result, + thrust::incrementable_traversal_tag, + thrust::random_access_traversal_tag) +{ + //std::cerr << std::endl; + //std::cerr << "general copy_host_to_device(): InputIterator: " << typeid(InputIterator).name() << std::endl; + //std::cerr << "general copy_host_to_device(): OutputIterator: " << typeid(OutputIterator).name() << std::endl; + + typedef typename thrust::iterator_value::type InputType; + + // allocate temporary storage in System1 + thrust::detail::temporary_array temp(systems.system1,begin,end); + return thrust::copy(systems, temp.begin(), temp.end(), result); +} + +template + RandomAccessIterator copy_cross_system_n(cross_system systems, + InputIterator first, + Size n, + RandomAccessIterator result, + thrust::incrementable_traversal_tag, + thrust::random_access_traversal_tag) +{ + typedef typename thrust::iterator_value::type InputType; + + // allocate and copy to temporary storage System1 + thrust::detail::temporary_array temp(systems.system1, first, n); + + // recurse + return copy_cross_system(systems, temp.begin(), temp.end(), result); +} + + +// random access to general output case +template + OutputIterator copy_cross_system(cross_system systems, + RandomAccessIterator begin, + RandomAccessIterator end, + OutputIterator result, + thrust::random_access_traversal_tag, + thrust::incrementable_traversal_tag) +{ + typedef typename thrust::iterator_value::type InputType; + + // copy to temporary storage in System2 + thrust::detail::temporary_array temp(systems.system2, systems.system1, begin, end); + + return thrust::copy(systems.system2, temp.begin(), temp.end(), result); +} + +template + OutputIterator copy_cross_system_n(cross_system systems, + RandomAccessIterator first, + Size n, + OutputIterator result, + thrust::random_access_traversal_tag, + thrust::incrementable_traversal_tag) +{ + typedef typename thrust::iterator_value::type InputType; + + // copy to temporary storage in System2 + thrust::detail::temporary_array temp(systems.system2, systems.system1, first, n); + + // copy temp to result + return thrust::copy(systems.system2, temp.begin(), temp.end(), result); +} + + +// trivial copy +template + RandomAccessIterator2 copy_cross_system(cross_system systems, + RandomAccessIterator1 begin, + RandomAccessIterator1 end, + RandomAccessIterator2 result, + thrust::random_access_traversal_tag, + thrust::random_access_traversal_tag, + thrust::detail::true_type) // trivial copy +{ +// std::cerr << std::endl; +// std::cerr << "random access copy_device_to_host(): trivial" << std::endl; +// std::cerr << "general copy_device_to_host(): RandomAccessIterator1: " << typeid(RandomAccessIterator1).name() << std::endl; +// std::cerr << "general copy_device_to_host(): RandomAccessIterator2: " << typeid(RandomAccessIterator2).name() << std::endl; + + // how many elements to copy? 
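+  // (the value types match and both iterators are trivial, so once the
+  //  element count is known the transfer reduces to a single bulk copy
+  //  across systems via trivial_copy_n)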
+ typename thrust::iterator_traits::difference_type n = end - begin; + + thrust::system::cuda::detail::trivial_copy_n(systems, begin, n, result); + + return result + n; +} + + +namespace detail +{ + +// random access non-trivial iterator to random access iterator +template + RandomAccessIterator2 non_trivial_random_access_copy_cross_system(cross_system systems, + RandomAccessIterator1 begin, + RandomAccessIterator1 end, + RandomAccessIterator2 result, + thrust::detail::false_type) // InputIterator is non-trivial +{ + // copy the input to a temporary input system buffer of OutputType + typedef typename thrust::iterator_value::type OutputType; + + // allocate temporary storage in System1 + thrust::detail::temporary_array temp(systems.system1, begin, end); + + // recurse + return copy_cross_system(systems, temp.begin(), temp.end(), result); +} + +template + RandomAccessIterator2 non_trivial_random_access_copy_cross_system(cross_system systems, + RandomAccessIterator1 begin, + RandomAccessIterator1 end, + RandomAccessIterator2 result, + thrust::detail::true_type) // InputIterator is trivial +{ + typename thrust::iterator_difference::type n = thrust::distance(begin, end); + + // allocate temporary storage in System2 + // retain the input's type for the intermediate storage + // do not initialize the storage (the 0 does this) + typedef typename thrust::iterator_value::type InputType; + thrust::detail::temporary_array temp(0, systems.system2, n); + + // force a trivial (memcpy) copy of the input to the temporary + // note that this will not correctly account for copy constructors + // but there's nothing we can do about that + // XXX one thing we might try is to use pinned memory for the temporary storage + // this might allow us to correctly account for copy constructors + thrust::system::cuda::detail::trivial_copy_n(systems, begin, n, temp.begin()); + + // finally, copy to the result + return thrust::copy(systems.system2, temp.begin(), temp.end(), result); +} + +} // end detail + + +// random access iterator to random access host iterator with non-trivial copy +template + RandomAccessIterator2 copy_cross_system(cross_system systems, + RandomAccessIterator1 begin, + RandomAccessIterator1 end, + RandomAccessIterator2 result, + thrust::random_access_traversal_tag, + thrust::random_access_traversal_tag, + thrust::detail::false_type) // is_trivial_copy +{ + // dispatch a non-trivial random access cross system copy based on whether or not the InputIterator is trivial + return detail::non_trivial_random_access_copy_cross_system(systems, begin, end, result, + typename thrust::detail::is_trivial_iterator::type()); +} + +// random access iterator to random access iterator +template + RandomAccessIterator2 copy_cross_system(cross_system systems, + RandomAccessIterator1 begin, + RandomAccessIterator1 end, + RandomAccessIterator2 result, + thrust::random_access_traversal_tag input_traversal, + thrust::random_access_traversal_tag output_traversal) +{ + // dispatch on whether this is a trivial copy + return copy_cross_system(systems, begin, end, result, input_traversal, output_traversal, + typename thrust::detail::dispatch::is_trivial_copy::type()); +} + +template + RandomAccessIterator2 copy_cross_system_n(cross_system systems, + RandomAccessIterator1 first, + Size n, + RandomAccessIterator2 result, + thrust::random_access_traversal_tag input_traversal, + thrust::random_access_traversal_tag output_traversal) +{ + // implement with copy_cross_system + return copy_cross_system(systems, first, first + n, 
result, input_traversal, output_traversal); +} + +///////////////// +// Entry Point // +///////////////// + +template + OutputIterator copy_cross_system(cross_system systems, + InputIterator begin, + InputIterator end, + OutputIterator result) +{ + return copy_cross_system(systems, begin, end, result, + typename thrust::iterator_traversal::type(), + typename thrust::iterator_traversal::type()); +} + +template + OutputIterator copy_cross_system_n(cross_system systems, + InputIterator begin, + Size n, + OutputIterator result) +{ + return copy_cross_system_n(systems, begin, n, result, + typename thrust::iterator_traversal::type(), + typename thrust::iterator_traversal::type()); +} + +} // end detail +} // end cuda +} // end system +} // end thrust + diff --git a/compat/thrust/system/cuda/detail/copy_device_to_device.h b/compat/thrust/system/cuda/detail/copy_device_to_device.h new file mode 100644 index 0000000..a7d8df8 --- /dev/null +++ b/compat/thrust/system/cuda/detail/copy_device_to_device.h @@ -0,0 +1,51 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file copy_device_to_device.h + * \brief Device implementations for copying on the device. + */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ + + +template + OutputIterator copy_device_to_device(execution_policy &exec, + InputIterator begin, + InputIterator end, + OutputIterator result); + +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/cuda/detail/copy_device_to_device.inl b/compat/thrust/system/cuda/detail/copy_device_to_device.inl new file mode 100644 index 0000000..c8263c5 --- /dev/null +++ b/compat/thrust/system/cuda/detail/copy_device_to_device.inl @@ -0,0 +1,127 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ +namespace detail +{ + +template + OutputIterator copy_device_to_device(execution_policy &exec, + InputIterator begin, + InputIterator end, + OutputIterator result, + thrust::detail::false_type) +{ + // general case (mixed types) + typedef typename thrust::iterator_traits::value_type InputType; + +#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC + return thrust::transform(exec, begin, end, result, thrust::identity()); +#else + // we're not compiling with nvcc: copy [begin, end) to temp host memory + typename thrust::iterator_traits::difference_type n = thrust::distance(begin, end); + + thrust::host_system_tag temp_exec; + thrust::detail::temporary_array temp1(temp_exec, begin, end); + + // transform temp1 to OutputType in host memory + typedef typename thrust::iterator_traits::value_type OutputType; + + thrust::detail::temporary_array temp2(temp_exec, temp1.begin(), temp1.end()); + + // copy temp2 to device + result = thrust::system::cuda::detail::copy_cross_system(temp2.begin(), temp2.end(), result); + + return result; +#endif // THRUST_DEVICE_COMPILER_NVCC +} + + +template + OutputIterator copy_device_to_device(execution_policy &exec, + InputIterator begin, + InputIterator end, + OutputIterator result, + thrust::detail::true_type) +{ + // specialization for device to device when the value_types match, operator= is not overloaded, + // and the iterators are pointers + + // how many elements to copy? + typename thrust::iterator_traits::difference_type n = end - begin; + + thrust::system::cuda::detail::trivial_copy_n(exec, begin, n, result); + + return result + n; +} + +} // end namespace detail + +///////////////// +// Entry Point // +///////////////// + +template + OutputIterator copy_device_to_device(execution_policy &exec, + InputIterator begin, + InputIterator end, + OutputIterator result) +{ + typedef typename thrust::iterator_traits::value_type InputType; + typedef typename thrust::iterator_traits::value_type OutputType; + + const bool use_trivial_copy = + thrust::detail::is_same::value + && thrust::detail::is_trivial_iterator::value + && thrust::detail::is_trivial_iterator::value; + + // XXX WAR unused variable warning + (void) use_trivial_copy; + + return detail::copy_device_to_device(exec, begin, end, result, + thrust::detail::integral_constant()); + +} + +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cuda/detail/copy_if.h b/compat/thrust/system/cuda/detail/copy_if.h new file mode 100644 index 0000000..5ed0f6c --- /dev/null +++ b/compat/thrust/system/cuda/detail/copy_if.h @@ -0,0 +1,49 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ + +template + OutputIterator copy_if(execution_policy &exec, + InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator result, + Predicate pred); + +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/cuda/detail/copy_if.inl b/compat/thrust/system/cuda/detail/copy_if.inl new file mode 100644 index 0000000..15ea7fa --- /dev/null +++ b/compat/thrust/system/cuda/detail/copy_if.inl @@ -0,0 +1,212 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ + +template +struct copy_if_intervals_closure +{ + InputIterator1 input; + InputIterator2 stencil; + InputIterator3 offsets; + Decomposition decomp; + OutputIterator output; + + typedef Context context_type; + context_type context; + + copy_if_intervals_closure(InputIterator1 input, + InputIterator2 stencil, + InputIterator3 offsets, + Decomposition decomp, + OutputIterator output, + Context context = Context()) + : input(input), stencil(stencil), offsets(offsets), decomp(decomp), output(output), context(context) {} + + __device__ __thrust_forceinline__ + void operator()(void) + { + typedef typename thrust::iterator_value::type OutputType; + + typedef unsigned int PredicateType; + + const unsigned int CTA_SIZE = context_type::ThreadsPerBlock::value; + + thrust::plus binary_op; + + __shared__ PredicateType sdata[CTA_SIZE]; context.barrier(); + + typedef typename Decomposition::index_type IndexType; + + // this block processes results in [range.begin(), range.end()) + thrust::system::detail::internal::index_range range = decomp[context.block_index()]; + + IndexType base = range.begin(); + + PredicateType predicate = 0; + + // advance input iterators to this thread's starting position + input += base + context.thread_index(); + stencil += base + context.thread_index(); + + // advance output to this interval's starting position + if (context.block_index() != 0) + { + InputIterator3 temp = offsets + (context.block_index() - 1); + output += *temp; + } + + // process full blocks + while(base + CTA_SIZE <= range.end()) + { + // read data + sdata[context.thread_index()] = predicate = *stencil; + + context.barrier(); + + // scan block + block::inclusive_scan(context, sdata, binary_op); + + // write data + if (predicate) + { + OutputIterator temp2 = output + (sdata[context.thread_index()] - 1); + *temp2 = *input; + } + + // advance inputs by CTA_SIZE + base += CTA_SIZE; + input += CTA_SIZE; + stencil += CTA_SIZE; + + // advance output by number of true predicates + 
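This closure, together with the copy_if() entry point that follows, implements stream compaction in three steps: count the true predicates in each interval, scan those counts to obtain each interval's starting offset in the output, then scatter within each interval using a block-wide inclusive scan. A sequential, host-only sketch of the same count/scan/scatter structure (illustrative only, not the Thrust API):

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <numeric>
#include <vector>

// compact the values whose stencil entry is non-zero, processing the input in
// fixed-size chunks the way each thread block processes one interval
std::vector<int> chunked_copy_if(const std::vector<int>& values,
                                 const std::vector<int>& stencil,
                                 std::size_t chunk)
{
    const std::size_t n = values.size();
    const std::size_t num_chunks = (n + chunk - 1) / chunk;

    // 1) count the true predicates in each chunk (cf. reduce_intervals)
    std::vector<std::size_t> counts(num_chunks, 0);
    for (std::size_t i = 0; i < n; ++i)
        if (stencil[i]) ++counts[i / chunk];

    // 2) inclusive scan of the per-chunk counts gives each chunk's end offset
    std::partial_sum(counts.begin(), counts.end(), counts.begin());

    // 3) scatter: chunk c starts writing at counts[c - 1] (0 for the first chunk)
    std::vector<int> out(counts.empty() ? 0 : counts.back());
    for (std::size_t c = 0; c < num_chunks; ++c)
    {
        std::size_t pos = (c == 0) ? 0 : counts[c - 1];
        for (std::size_t i = c * chunk; i < std::min(n, (c + 1) * chunk); ++i)
            if (stencil[i]) out[pos++] = values[i];
    }
    return out;
}

int main()
{
    std::vector<int> v = {5, 6, 7, 8, 9, 10};
    std::vector<int> s = {1, 0, 1, 1, 0, 1};
    for (int x : chunked_copy_if(v, s, 2)) std::cout << x << ' ';  // 5 7 8 10
}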
output += sdata[CTA_SIZE - 1]; + + context.barrier(); + } + + // process partially full block at end of input (if necessary) + if (base < range.end()) + { + // read data + if (base + context.thread_index() < range.end()) + sdata[context.thread_index()] = predicate = *stencil; + else + sdata[context.thread_index()] = predicate = 0; + + context.barrier(); + + // scan block + block::inclusive_scan(context, sdata, binary_op); + + // write data + if (predicate) // expects predicate=false for >= interval_end + { + OutputIterator temp2 = output + (sdata[context.thread_index()] - 1); + *temp2 = *input; + } + } + } +}; // copy_if_intervals_closure + + +template + OutputIterator copy_if(execution_policy &exec, + InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator output, + Predicate pred) +{ + typedef typename thrust::iterator_difference::type IndexType; + typedef typename thrust::iterator_value::type OutputType; + + if (first == last) + return output; + + typedef thrust::system::detail::internal::uniform_decomposition Decomposition; + typedef thrust::detail::temporary_array IndexArray; + + Decomposition decomp = default_decomposition(last - first); + + // storage for per-block predicate counts + IndexArray block_results(exec, decomp.size()); + + // convert stencil into an iterator that produces integral values in {0,1} + typedef typename thrust::detail::predicate_to_integral PredicateToIndexTransform; + typedef thrust::transform_iterator PredicateToIndexIterator; + + PredicateToIndexIterator predicate_stencil(stencil, PredicateToIndexTransform(pred)); + + // compute number of true values in each interval + thrust::system::cuda::detail::reduce_intervals(exec, predicate_stencil, block_results.begin(), thrust::plus(), decomp); + + // scan the partial sums + thrust::inclusive_scan(exec, block_results.begin(), block_results.end(), block_results.begin(), thrust::plus()); + + // copy values to output + const unsigned int ThreadsPerBlock = 256; + typedef typename IndexArray::iterator InputIterator3; + typedef detail::statically_blocked_thread_array Context; + typedef copy_if_intervals_closure Closure; + Closure closure(first, predicate_stencil, block_results.begin(), decomp, output); + detail::launch_closure(closure, decomp.size(), ThreadsPerBlock); + + return output + block_results[decomp.size() - 1]; +} // end copy_if() + + +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + +__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END + diff --git a/compat/thrust/system/cuda/detail/count.h b/compat/thrust/system/cuda/detail/count.h new file mode 100644 index 0000000..a307fc5 --- /dev/null +++ b/compat/thrust/system/cuda/detail/count.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +// this system has no special version of this algorithm + diff --git a/compat/thrust/system/cuda/detail/cuda_launch_config.h b/compat/thrust/system/cuda/detail/cuda_launch_config.h new file mode 100644 index 0000000..b7f0ca2 --- /dev/null +++ b/compat/thrust/system/cuda/detail/cuda_launch_config.h @@ -0,0 +1,384 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ + + +// XXX define our own device_properties_t to avoid errors when #including +// this file in the absence of a CUDA installation +struct device_properties_t +{ + // mirror the type and spelling of cudaDeviceProp's members + // keep these alphabetized + int major; + int maxGridSize[3]; + int maxThreadsPerBlock; + int maxThreadsPerMultiProcessor; + int minor; + int multiProcessorCount; + int regsPerBlock; + size_t sharedMemPerBlock; + int warpSize; +}; + + +// XXX define our own device_properties_t to avoid errors when #including +// this file in the absence of a CUDA installation +struct function_attributes_t +{ + // mirror the type and spelling of cudaFuncAttributes' members + // keep these alphabetized + size_t constSizeBytes; + size_t localSizeBytes; + int maxThreadsPerBlock; + int numRegs; + size_t sharedSizeBytes; +}; + + +/*! Computes a block size in number of threads for a CUDA kernel using a occupancy-promoting heuristic. + * \param attributes The cudaFuncAttributes corresponding to a __global__ function of interest on a GPU of interest. + * \param properties The cudaDeviceProp corresponding to a GPU on which to launch the __global__ function of interest. + * \return A CUDA block size, in number of threads, which the resources of the GPU's streaming multiprocessor can + * accomodate and which is intended to promote occupancy. The result is equivalent to the one performed by + * the "CUDA Occupancy Calculator". + * \note The __global__ function of interest is presumed to use 0 bytes of dynamically-allocated __shared__ memory. + */ +inline __host__ __device__ +std::size_t block_size_with_maximum_potential_occupancy(const function_attributes_t &attributes, + const device_properties_t &properties); + +/*! Computes a block size in number of threads for a CUDA kernel using a occupancy-promoting heuristic. + * Use this version of the function when a CUDA block's dynamically-allocated __shared__ memory requirements + * vary with the size of the block. + * \param attributes The cudaFuncAttributes corresponding to a __global__ function of interest on a GPU of interest. + * \param properties The cudaDeviceProp corresponding to a GPU on which to launch the __global__ function of interest. + * \param block_size_to_dynamic_smem_bytes A unary function which maps an integer CUDA block size to the number of bytes + * of dynamically-allocated __shared__ memory required by a CUDA block of that size. 
+ * \return A CUDA block size, in number of threads, which the resources of the GPU's streaming multiprocessor can + * accomodate and which is intended to promote occupancy. The result is equivalent to the one performed by + * the "CUDA Occupancy Calculator". + */ +template +inline __host__ __device__ +std::size_t block_size_with_maximum_potential_occupancy(const function_attributes_t &attributes, + const device_properties_t &properties, + UnaryFunction block_size_to_dynamic_smem_size); + + +/*! Returns the maximum amount of dynamic shared memory each block + * can utilize without reducing thread occupancy. + * + * \param properties CUDA device properties + * \param attributes CUDA function attributes + * \param blocks_per_processor Number of blocks per streaming multiprocessor + */ +inline __host__ __device__ +size_t proportional_smem_allocation(const device_properties_t &properties, + const function_attributes_t &attributes, + size_t blocks_per_processor); + + +template +inline __host__ __device__ +size_t max_blocksize_subject_to_smem_usage(const device_properties_t &properties, + const function_attributes_t &attributes, + UnaryFunction blocksize_to_dynamic_smem_usage); + + + +namespace cuda_launch_config_detail +{ + +using std::size_t; + +namespace util +{ + + +template +inline __host__ __device__ +T min_(const T &lhs, const T &rhs) +{ + return rhs < lhs ? rhs : lhs; +} + + +template +struct zero_function +{ + inline __host__ __device__ + T operator()(T) + { + return 0; + } +}; + + +// x/y rounding towards +infinity for integers, used to determine # of blocks/warps etc. +template + inline __host__ __device__ L divide_ri(const L x, const R y) +{ + return (x + (y - 1)) / y; +} + +// x/y rounding towards zero for integers, used to determine # of blocks/warps etc. +template + inline __host__ __device__ L divide_rz(const L x, const R y) +{ + return x / y; +} + +// round x towards infinity to the next multiple of y +template + inline __host__ __device__ L round_i(const L x, const R y){ return y * divide_ri(x, y); } + +// round x towards zero to the next multiple of y +template + inline __host__ __device__ L round_z(const L x, const R y){ return y * divide_rz(x, y); } + +} // end namespace util + + + +// granularity of shared memory allocation +inline __host__ __device__ +size_t smem_allocation_unit(const device_properties_t &properties) +{ + switch(properties.major) + { + case 1: return 512; + case 2: return 128; + case 3: return 256; + default: return 256; // unknown GPU; have to guess + } +} + + +// granularity of register allocation +inline __host__ __device__ +size_t reg_allocation_unit(const device_properties_t &properties, const size_t regsPerThread) +{ + switch(properties.major) + { + case 1: return (properties.minor <= 1) ? 256 : 512; + case 2: switch(regsPerThread) + { + case 21: + case 22: + case 29: + case 30: + case 37: + case 38: + case 45: + case 46: + return 128; + default: + return 64; + } + case 3: return 256; + default: return 256; // unknown GPU; have to guess + } +} + + +// granularity of warp allocation +inline __host__ __device__ +size_t warp_allocation_multiple(const device_properties_t &properties) +{ + return (properties.major <= 1) ? 
2 : 1; +} + +// number of "sides" into which the multiprocessor is partitioned +inline __host__ __device__ +size_t num_sides_per_multiprocessor(const device_properties_t &properties) +{ + switch(properties.major) + { + case 1: return 1; + case 2: return 2; + case 3: return 4; + default: return 4; // unknown GPU; have to guess + } +} + + +inline __host__ __device__ +size_t max_blocks_per_multiprocessor(const device_properties_t &properties) +{ + return (properties.major <= 2) ? 8 : 16; +} + + +inline __host__ __device__ +size_t max_active_blocks_per_multiprocessor(const device_properties_t &properties, + const function_attributes_t &attributes, + int CTA_SIZE, + size_t dynamic_smem_bytes) +{ + // Determine the maximum number of CTAs that can be run simultaneously per SM + // This is equivalent to the calculation done in the CUDA Occupancy Calculator spreadsheet + + ////////////////////////////////////////// + // Limits due to threads/SM or blocks/SM + ////////////////////////////////////////// + const size_t maxThreadsPerSM = properties.maxThreadsPerMultiProcessor; // 768, 1024, 1536, etc. + const size_t maxBlocksPerSM = max_blocks_per_multiprocessor(properties); + + // Calc limits + const size_t ctaLimitThreads = (CTA_SIZE <= properties.maxThreadsPerBlock) ? maxThreadsPerSM / CTA_SIZE : 0; + const size_t ctaLimitBlocks = maxBlocksPerSM; + + ////////////////////////////////////////// + // Limits due to shared memory/SM + ////////////////////////////////////////// + const size_t smemAllocationUnit = smem_allocation_unit(properties); + const size_t smemBytes = attributes.sharedSizeBytes + dynamic_smem_bytes; + const size_t smemPerCTA = util::round_i(smemBytes, smemAllocationUnit); + + // Calc limit + const size_t ctaLimitSMem = smemPerCTA > 0 ? properties.sharedMemPerBlock / smemPerCTA : maxBlocksPerSM; + + ////////////////////////////////////////// + // Limits due to registers/SM + ////////////////////////////////////////// + const size_t regAllocationUnit = reg_allocation_unit(properties, attributes.numRegs); + const size_t warpAllocationMultiple = warp_allocation_multiple(properties); + const size_t numWarps = util::round_i(util::divide_ri(CTA_SIZE, properties.warpSize), warpAllocationMultiple); + + // Calc limit + size_t ctaLimitRegs; + if(properties.major <= 1) + { + // GPUs of compute capability 1.x allocate registers to CTAs + // Number of regs per block is regs per thread times number of warps times warp size, rounded up to allocation unit + const size_t regsPerCTA = util::round_i(attributes.numRegs * properties.warpSize * numWarps, regAllocationUnit); + ctaLimitRegs = regsPerCTA > 0 ? properties.regsPerBlock / regsPerCTA : maxBlocksPerSM; + } + else + { + // GPUs of compute capability 2.x and higher allocate registers to warps + // Number of regs per warp is regs per thread times times warp size, rounded up to allocation unit + const size_t regsPerWarp = util::round_i(attributes.numRegs * properties.warpSize, regAllocationUnit); + const size_t numSides = num_sides_per_multiprocessor(properties); + const size_t numRegsPerSide = properties.regsPerBlock / numSides; + ctaLimitRegs = regsPerWarp > 0 ? 
((numRegsPerSide / regsPerWarp) * numSides) / numWarps : maxBlocksPerSM; + } + + ////////////////////////////////////////// + // Overall limit is min() of limits due to above reasons + ////////////////////////////////////////// + return util::min_(ctaLimitRegs, util::min_(ctaLimitSMem, util::min_(ctaLimitThreads, ctaLimitBlocks))); +} + + +} // end namespace cuda_launch_config_detail + + +template +inline __host__ __device__ +std::size_t block_size_with_maximum_potential_occupancy(const function_attributes_t &attributes, + const device_properties_t &properties, + UnaryFunction block_size_to_dynamic_smem_size) +{ + size_t max_occupancy = properties.maxThreadsPerMultiProcessor; + size_t largest_blocksize = cuda_launch_config_detail::util::min_(properties.maxThreadsPerBlock, attributes.maxThreadsPerBlock); + size_t granularity = properties.warpSize; + size_t max_blocksize = 0; + size_t highest_occupancy = 0; + + for(size_t blocksize = largest_blocksize; blocksize != 0; blocksize -= granularity) + { + size_t occupancy = blocksize * cuda_launch_config_detail::max_active_blocks_per_multiprocessor(properties, attributes, blocksize, block_size_to_dynamic_smem_size(blocksize)); + + if(occupancy > highest_occupancy) + { + max_blocksize = blocksize; + highest_occupancy = occupancy; + } + + // early out, can't do better + if(highest_occupancy == max_occupancy) + break; + } + + return max_blocksize; +} + + +inline __host__ __device__ +std::size_t block_size_with_maximum_potential_occupancy(const function_attributes_t &attributes, + const device_properties_t &properties) +{ + return block_size_with_maximum_potential_occupancy(attributes, properties, cuda_launch_config_detail::util::zero_function()); +} + + +inline __host__ __device__ +size_t proportional_smem_allocation(const device_properties_t &properties, + const function_attributes_t &attributes, + size_t blocks_per_processor) +{ + size_t smem_per_processor = properties.sharedMemPerBlock; + size_t smem_allocation_unit = cuda_launch_config_detail::smem_allocation_unit(properties); + + size_t total_smem_per_block = cuda_launch_config_detail::util::round_z(smem_per_processor / blocks_per_processor, smem_allocation_unit); + size_t static_smem_per_block = attributes.sharedSizeBytes; + + return total_smem_per_block - static_smem_per_block; +} + + +template +inline __host__ __device__ +size_t max_blocksize_subject_to_smem_usage(const device_properties_t &properties, + const function_attributes_t &attributes, + UnaryFunction blocksize_to_dynamic_smem_usage) +{ + size_t largest_blocksize = (thrust::min)(properties.maxThreadsPerBlock, attributes.maxThreadsPerBlock); + size_t granularity = properties.warpSize; + + for(int blocksize = largest_blocksize; blocksize > 0; blocksize -= granularity) + { + size_t total_smem_usage = blocksize_to_dynamic_smem_usage(blocksize) + attributes.sharedSizeBytes; + + if(total_smem_usage <= properties.sharedMemPerBlock) + { + return blocksize; + } + } + + return 0; +} + + +} // end detail +} // end cuda +} // end system +} // end thrust + diff --git a/compat/thrust/system/cuda/detail/default_decomposition.h b/compat/thrust/system/cuda/detail/default_decomposition.h new file mode 100644 index 0000000..1ed6bcf --- /dev/null +++ b/compat/thrust/system/cuda/detail/default_decomposition.h @@ -0,0 +1,45 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file default_decomposition.h + * \brief Return a decomposition that is appropriate for the CUDA backend. + */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ + +template +thrust::system::detail::internal::uniform_decomposition default_decomposition(IndexType n); + +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/cuda/detail/default_decomposition.inl b/compat/thrust/system/cuda/detail/default_decomposition.inl new file mode 100644 index 0000000..3f0879a --- /dev/null +++ b/compat/thrust/system/cuda/detail/default_decomposition.inl @@ -0,0 +1,41 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ + +template +thrust::system::detail::internal::uniform_decomposition default_decomposition(IndexType n) +{ + // TODO eliminate magical constant + device_properties_t properties = device_properties(); + return thrust::system::detail::internal::uniform_decomposition(n, properties.maxThreadsPerBlock, 10 * properties.multiProcessorCount); +} + +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cuda/detail/detail/alignment.h b/compat/thrust/system/cuda/detail/detail/alignment.h new file mode 100644 index 0000000..31fdaaf --- /dev/null +++ b/compat/thrust/system/cuda/detail/detail/alignment.h @@ -0,0 +1,223 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ +namespace detail +{ +namespace alignment_of_detail +{ + + +template class alignment_of_impl; + +template + struct helper +{ + static const std::size_t value = size_diff; +}; + +template + class helper +{ + public: + static const std::size_t value = alignment_of_impl::value; +}; + +template + class alignment_of_impl +{ + private: + struct big { T x; char c; }; + + public: + static const std::size_t value = helper::value; +}; + + +} // end alignment_of_detail + + +template + struct alignment_of + : alignment_of_detail::alignment_of_impl +{}; + + +template struct aligned_type; + +// __align__ is CUDA-specific, so guard it +#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC + +// implementing aligned_type portably is tricky: + +# if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC +// implement aligned_type with specialization because MSVC +// requires literals as arguments to declspec(align(n)) +template<> struct aligned_type<1> +{ + struct __align__(1) type { }; +}; + +template<> struct aligned_type<2> +{ + struct __align__(2) type { }; +}; + +template<> struct aligned_type<4> +{ + struct __align__(4) type { }; +}; + +template<> struct aligned_type<8> +{ + struct __align__(8) type { }; +}; + +template<> struct aligned_type<16> +{ + struct __align__(16) type { }; +}; + +template<> struct aligned_type<32> +{ + struct __align__(32) type { }; +}; + +template<> struct aligned_type<64> +{ + struct __align__(64) type { }; +}; + +template<> struct aligned_type<128> +{ + struct __align__(128) type { }; +}; + +template<> struct aligned_type<256> +{ + struct __align__(256) type { }; +}; + +template<> struct aligned_type<512> +{ + struct __align__(512) type { }; +}; + +template<> struct aligned_type<1024> +{ + struct __align__(1024) type { }; +}; + +template<> struct aligned_type<2048> +{ + struct __align__(2048) type { }; +}; + +template<> struct aligned_type<4096> +{ + struct __align__(4096) type { }; +}; + +template<> struct aligned_type<8192> +{ + struct __align__(8192) type { }; +}; +# elif (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC) && (THRUST_GCC_VERSION < 40300) +// implement aligned_type with specialization because gcc 4.2 +// requires literals as arguments to __attribute__(aligned(n)) +template<> struct aligned_type<1> +{ + struct __align__(1) type { }; +}; + +template<> struct aligned_type<2> +{ + struct __align__(2) type { }; +}; + +template<> struct aligned_type<4> +{ + struct __align__(4) type { }; +}; + +template<> struct aligned_type<8> +{ + struct __align__(8) type { }; +}; + +template<> struct aligned_type<16> +{ + struct __align__(16) type { }; +}; + +template<> struct aligned_type<32> +{ + struct __align__(32) type { }; +}; + +template<> struct aligned_type<64> +{ + struct __align__(64) type { }; +}; + +template<> struct aligned_type<128> +{ + struct __align__(128) type { }; +}; + +# else +// assume the compiler allows template parameters as +// arguments to __align__ +template struct aligned_type +{ + struct __align__(Align) type { }; +}; +# endif // THRUST_HOST_COMPILER +#else +template struct aligned_type +{ + struct type { }; +}; +#endif // THRUST_DEVICE_COMPILER + + +template + struct aligned_storage +{ + union type + { + unsigned char data[Len]; + + typename aligned_type::type align; + }; +}; + + +} // end detail +} // end detail +} // end cuda +} // end system +} // end thrust + diff --git a/compat/thrust/system/cuda/detail/detail/b40c/kernel_utils.h 
b/compat/thrust/system/cuda/detail/detail/b40c/kernel_utils.h new file mode 100644 index 0000000..e2c5a44 --- /dev/null +++ b/compat/thrust/system/cuda/detail/detail/b40c/kernel_utils.h @@ -0,0 +1,284 @@ +/** + * Copyright 2010 Duane Merrill + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * For more information, see our Google Code project site: + * http://code.google.com/p/back40computing/ + * + * Thanks! + */ + + +//------------------------------------------------------------------------------ +// Common B40C Defines, Properties, and Routines +//------------------------------------------------------------------------------ + + +#pragma once + +#include +#include + +namespace thrust { +namespace system { +namespace cuda { +namespace detail { +namespace detail { +namespace b40c_thrust { + +//------------------------------------------------------------------------------ +// Device properties +//------------------------------------------------------------------------------ + + +#ifndef __CUDA_ARCH__ + #define __CUDA_ARCH__ 0 +#endif + +#define B40C_FERMI(version) (version >= 200) +#define B40C_LOG_WARP_THREADS 5 // 32 threads in a warp +#define B40C_WARP_THREADS (1 << B40C_LOG_WARP_THREADS) +#define B40C_LOG_MEM_BANKS(version) ((version >= 200) ? 5 : 4) // 32 banks on fermi, 16 on tesla +#define B40C_MEM_BANKS(version) (1 << B40C_LOG_MEM_BANKS(version)) + +// TODO refactor these +#if __CUDA_ARCH__ >= 200 + #define FastMul(a, b) (a * b) +#else + #define FastMul(a, b) (__umul24(a, b)) +#endif + +#if __CUDA_ARCH__ >= 120 + #define WarpVoteAll(active_threads, predicate) (__all(predicate)) +#else + #define WarpVoteAll(active_threads, predicate) (EmulatedWarpVoteAll(predicate)) +#endif + +#if __CUDA_ARCH__ >= 200 + #define TallyWarpVote(active_threads, predicate, storage) (__popc(__ballot(predicate))) +#else + #define TallyWarpVote(active_threads, predicate, storage) (TallyWarpVoteSm10(predicate, storage)) +#endif + +#ifdef __LP64__ + #define _B40C_LP64_ true +#else + #define _B40C_LP64_ false +#endif + +#define _B40C_REG_MISER_QUALIFIER_ __shared__ + + +//------------------------------------------------------------------------------ +// Handy routines +//------------------------------------------------------------------------------ + + +/** + * Select maximum + */ +#define B40C_MAX(a, b) ((a > b) ? a : b) + + +/** + * MagnitudeShift(). Allows you to shift left for positive magnitude values, + * right for negative. + * + * N.B. This code is a little strange; we are using this meta-programming + * pattern of partial template specialization for structures in order to + * decide whether to shift left or right. Normally we would just use a + * conditional to decide if something was negative or not and then shift + * accordingly, knowing that the compiler will elide the untaken branch, + * i.e., the out-of-bounds shift during dead code elimination. 
However, + * the pass for bounds-checking shifts seems to happen before the DCE + * phase, which results in a an unsightly number of compiler warnings, so + * we force the issue earlier using structural template specialization. + */ + +template struct MagnitudeShiftOp; + +template +struct MagnitudeShiftOp { + __device__ __forceinline__ static K Shift(K key) { + return key << magnitude; + } +}; + +template +struct MagnitudeShiftOp { + __device__ __forceinline__ static K Shift(K key) { + return key >> magnitude; + } +}; + +template +__device__ __forceinline__ K MagnitudeShift(K key) { + return MagnitudeShiftOp 0) ? magnitude : magnitude * -1, (magnitude > 0)>::Shift(key); +} + + +/** + * Supress warnings for unused constants + */ +template +__device__ __forceinline__ void SuppressUnusedConstantWarning(const T) {} + + + + +//------------------------------------------------------------------------------ +// Common device routines +//------------------------------------------------------------------------------ + + +/** + * Perform a warp-synchrounous prefix scan. Allows for diverting a warp's + * threads into separate scan problems (multi-scan). + */ +template +__device__ __forceinline__ int WarpScan( + volatile int warpscan[][NUM_ELEMENTS], + int partial_reduction, + int copy_section) { + + int warpscan_idx; + if (MULTI_SCAN) { + warpscan_idx = threadIdx.x & (NUM_ELEMENTS - 1); + } else { + warpscan_idx = threadIdx.x; + } + + warpscan[1][warpscan_idx] = partial_reduction; + + if (NUM_ELEMENTS > 1) warpscan[1][warpscan_idx] = partial_reduction = + partial_reduction + warpscan[1][warpscan_idx - 1]; + if (NUM_ELEMENTS > 2) warpscan[1][warpscan_idx] = partial_reduction = + partial_reduction + warpscan[1][warpscan_idx - 2]; + if (NUM_ELEMENTS > 4) warpscan[1][warpscan_idx] = partial_reduction = + partial_reduction + warpscan[1][warpscan_idx - 4]; + if (NUM_ELEMENTS > 8) warpscan[1][warpscan_idx] = partial_reduction = + partial_reduction + warpscan[1][warpscan_idx - 8]; + if (NUM_ELEMENTS > 16) warpscan[1][warpscan_idx] = partial_reduction = + partial_reduction + warpscan[1][warpscan_idx - 16]; + + if (copy_section > 0) { + warpscan[1 + copy_section][warpscan_idx] = partial_reduction; + } + + return warpscan[1][warpscan_idx - 1]; +} + +/** + * Perform a warp-synchronous reduction + */ +template +__device__ __forceinline__ void WarpReduce( + int idx, + volatile int *storage, + int partial_reduction) +{ + storage[idx] = partial_reduction; + + if (NUM_ELEMENTS > 16) storage[idx] = partial_reduction = partial_reduction + storage[idx + 16]; + if (NUM_ELEMENTS > 8) storage[idx] = partial_reduction = partial_reduction + storage[idx + 8]; + if (NUM_ELEMENTS > 4) storage[idx] = partial_reduction = partial_reduction + storage[idx + 4]; + if (NUM_ELEMENTS > 2) storage[idx] = partial_reduction = partial_reduction + storage[idx + 2]; + if (NUM_ELEMENTS > 1) storage[idx] = partial_reduction = partial_reduction + storage[idx + 1]; +} + + +/** + * Tally a warp-vote regarding the given predicate using the supplied storage + */ +template +__device__ __forceinline__ int TallyWarpVoteSm10(int predicate, int storage[]) { + WarpReduce(threadIdx.x, storage, predicate); + return storage[0]; +} + + +/** + * Tally a warp-vote regarding the given predicate + */ +template +__device__ __forceinline__ int TallyWarpVoteSm10(int predicate) { + __shared__ int vote_reduction[B40C_WARP_THREADS]; + return TallyWarpVoteSm10(predicate, vote_reduction); +} + +/** + * Emulate the __all() warp vote instruction + */ +template +__device__ 
__forceinline__ int EmulatedWarpVoteAll(int predicate) { + return (TallyWarpVoteSm10(predicate) == ACTIVE_THREADS); +} + + +/** + * Have each thread concurrently perform a serial reduction over its specified segment + */ +template +__device__ __forceinline__ int +SerialReduce(int segment[]) { + + int reduce = segment[0]; + + #pragma unroll + for (int i = 1; i < (int) LENGTH; i++) { + reduce += segment[i]; + } + + return reduce; +} + + +/** + * Have each thread concurrently perform a serial scan over its specified segment + */ +template +__device__ __forceinline__ +void SerialScan(int segment[], int seed0) { + + int seed1; + + #pragma unroll + for (int i = 0; i < (int) LENGTH; i += 2) { + seed1 = segment[i] + seed0; + segment[i] = seed0; + seed0 = seed1 + segment[i + 1]; + segment[i + 1] = seed1; + } +} + + + + +//------------------------------------------------------------------------------ +// Empty Kernels +//------------------------------------------------------------------------------ + +template +__global__ void FlushKernel(void) +{ +} + + +} // end namespace b40c_thrust +} // end namespace detail +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cuda/detail/detail/b40c/radixsort_api.h b/compat/thrust/system/cuda/detail/detail/b40c/radixsort_api.h new file mode 100644 index 0000000..2b199bb --- /dev/null +++ b/compat/thrust/system/cuda/detail/detail/b40c/radixsort_api.h @@ -0,0 +1,807 @@ +/****************************************************************************** + * Copyright 2010 Duane Merrill + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * + * + * + * AUTHORS' REQUEST: + * + * If you use|reference|benchmark this code, please cite our Technical + * Report (http://www.cs.virginia.edu/~dgm4d/papers/RadixSortTR.pdf): + * + * @TechReport{ Merrill:Sorting:2010, + * author = "Duane Merrill and Andrew Grimshaw", + * title = "Revisiting Sorting for GPGPU Stream Architectures", + * year = "2010", + * institution = "University of Virginia, Department of Computer Science", + * address = "Charlottesville, VA, USA", + * number = "CS2010-03" + * } + * + * For more information, see our Google Code project site: + * http://code.google.com/p/back40computing/ + * + * Thanks! + ******************************************************************************/ + + + +/****************************************************************************** + * Radix Sorting API + * + * USAGE: + * + * Using the B40C radix sorting implementation is easy. Just #include this API + * file and its kernel include dependencies within your source. 
Below are two + * examples for using: + * + * (1) A keys-only example for sorting floats: + * + * // Create storage-management structure + * RadixSortStorage device_storage(d_float_keys); + * + * // Create and enact sorter + * RadixSortingEnactor sorter(d_float_keys_len); + * sorter.EnactSort(device_storage); + * + * // Re-acquire pointer to sorted keys, free unused/temp storage + * d_float_keys = device_storage.d_keys; + * device_storage.CleanupTempStorage(); + * + * (2) And a key-value example for sorting ints paired with doubles: + * + * // Create storage-management structure + * RadixSortStorage device_storage(d_int_keys, d_double_values); + * + * // Create and enact sorter + * RadixSortingEnactor sorter(d_int_keys_len); + * sorter.EnactSort(device_storage); + * + * // Re-acquire pointer to sorted keys and values, free unused/temp storage + * d_int_keys = device_storage.d_keys; + * d_double_values = device_storage.d_values; + * device_storage.CleanupTempStorage(); + * + * + ******************************************************************************/ + +#pragma once + +#include +#include +#include +#include +#include + +#include "radixsort_reduction_kernel.h" +#include "radixsort_spine_kernel.h" +#include "radixsort_scanscatter_kernel.h" + +#include + +namespace thrust { +namespace system { +namespace cuda { +namespace detail { +namespace detail { +namespace b40c_thrust { + + +/****************************************************************************** + * Debugging options + ******************************************************************************/ + +static bool RADIXSORT_DEBUG = false; + + + +/****************************************************************************** + * Structures for mananging device-side sorting state + ******************************************************************************/ + +/** + * Sorting storage-management structure for device vectors + */ +template +struct RadixSortStorage { + + // Device vector of keys to sort + K* d_keys; + + // Device vector of values to sort + V* d_values; + + // Ancillary device vector for key storage + K* d_alt_keys; + + // Ancillary device vector for value storage + V* d_alt_values; + + // Temporary device storage needed for radix sorting histograms + int *d_spine; + + // Flip-flopping temporary device storage denoting which digit place + // pass should read from which input source (i.e., false if reading from + // keys, true if reading from alternate_keys + bool *d_from_alt_storage; + + // Host-side boolean whether or not an odd number of sorting passes left the + // results in alternate storage. If so, the d_keys (and d_values) pointers + // will have been swapped with the d_alt_keys (and d_alt_values) pointers in order to + // point to the final results. 
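A host-only sketch (with hypothetical names, and a plain copy standing in for the per-digit scatter) of the ping-pong buffering this flag records: each pass reads one buffer and writes the other, so after an odd number of passes the results end up in the alternate buffer:

#include <iostream>
#include <utility>
#include <vector>

int main()
{
    std::vector<int> keys = {3, 1, 2};
    std::vector<int> alt_keys(keys.size());

    std::vector<int>* d_keys     = &keys;       // stands in for the device pointers
    std::vector<int>* d_alt_keys = &alt_keys;

    const int passes = 3;                       // e.g. one pass per digit place
    for (int p = 0; p < passes; ++p)
    {
        *d_alt_keys = *d_keys;                  // "scatter" from one buffer into the other
        std::swap(d_keys, d_alt_keys);          // the next pass reads what was just written
    }

    const bool using_alternate_storage = (passes & 0x1);
    std::cout << (using_alternate_storage ? "results are in the alternate buffer\n"
                                          : "results are in the original buffer\n");
    // either way, d_keys already points at the buffer that holds the results,
    // which is why the enactor swizzles the storage pointers after an odd
    // number of passes
}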
+ bool using_alternate_storage; + + // Constructor + RadixSortStorage(K* keys = NULL, V* values = NULL) + { + d_keys = keys; + d_values = values; + d_alt_keys = NULL; + d_alt_values = NULL; + d_spine = NULL; + d_from_alt_storage = NULL; + + using_alternate_storage = false; + } + + // Clean up non-results storage (may include freeing original storage if + // primary pointers were swizzled as per using_alternate_storage) + cudaError_t CleanupTempStorage() + { + if (d_alt_keys) cudaFree(d_alt_keys); + if (d_alt_values) cudaFree(d_alt_values); + if (d_spine) cudaFree(d_spine); + if (d_from_alt_storage) cudaFree(d_from_alt_storage); + + return cudaSuccess; + } +}; + + + +/****************************************************************************** + * Base class for sorting enactors + ******************************************************************************/ + + +/** + * Base class for SRTS radix sorting enactors. + */ +template +class BaseRadixSortingEnactor +{ +public: + + // Unsigned integer type suitable for radix sorting of keys + typedef typename KeyConversion::UnsignedBits ConvertedKeyType; + +protected: + + // + // Information about our problem configuration + // + + bool _keys_only; + unsigned int _num_elements; + int _cycle_elements; + int _spine_elements; + int _grid_size; + CtaDecomposition _work_decomposition; + int _passes; + bool _swizzle_pointers_for_odd_passes; + + // Information about our target device + cudaDeviceProp _device_props; + int _device_sm_version; + + // Information about our kernel assembly + int _kernel_ptx_version; + cudaFuncAttributes _spine_scan_kernel_attrs; + +protected: + + /** + * Constructor. + */ + BaseRadixSortingEnactor(int passes, int radix_bits, unsigned int num_elements, int max_grid_size, bool swizzle_pointers_for_odd_passes = true); + + /** + * Heuristic for determining the number of CTAs to launch. + * + * @param[in] max_grid_size + * Maximum allowable number of CTAs to launch. A value of 0 indicates + * that the default value should be used. + * + * @return The actual number of CTAs that should be launched + */ + int GridSize(int max_grid_size); + + /** + * Performs a distribution sorting pass over a single digit place + */ + template + cudaError_t DigitPlacePass(const RadixSortStorage &converted_storage); + + /** + * Enacts a sorting operation by performing the the appropriate + * digit-place passes. To be overloaded by specialized subclasses. + */ + virtual cudaError_t EnactDigitPlacePasses(const RadixSortStorage &converted_storage) = 0; + +public: + + /** + * Returns the length (in unsigned ints) of the device vector needed for + * temporary storage of the reduction spine. Useful if pre-allocating + * your own device storage (as opposed to letting EnactSort() allocate it + * for you). + */ + int SpineElements() { return _spine_elements; } + + /** + * Returns whether or not the problem will fit on the device. + */ + bool CanFit(); + + /** + * Enacts a radix sorting operation on the specified device data. + * + * IMPORTANT NOTES: The device storage backing the specified input vectors of + * keys (and data) will be modified. (I.e., treat this as an in-place sort.) + * + * Additionally, the pointers in the problem_storage structure may be updated + * (a) depending upon the number of digit-place sorting passes needed, and (b) + * whether or not the caller has already allocated temporary storage. + * + * The sorted results will always be referenced by problem_storage.d_keys (and + * problem_storage.d_values). 
However, for an odd number of sorting passes (uncommon) + * these results will actually be backed by the storage initially allocated for + * by problem_storage.d_alt_keys (and problem_storage.d_alt_values). If so, + * problem_storage.d_alt_keys and problem_storage.d_alt_keys will be updated to + * reference the original problem_storage.d_keys and problem_storage.d_values in order + * to facilitate cleanup. + * + * This means it is important to avoid keeping stale copies of device pointers + * to keys/data; you will want to re-reference the pointers in problem_storage. + * + * @param[in/out] problem_storage + * Device vectors of keys and values to sort, and ancillary storage + * needed by the sorting kernels. See the IMPORTANT NOTES above. + * + * The problem_storage.[alternate_keys|alternate_values|d_spine] fields are + * temporary storage needed by the sorting kernels. To facilitate + * speed, callers are welcome to re-use this storage for same-sized + * (or smaller) sortign problems. If NULL, these storage vectors will be + * allocated by this routine (and must be subsequently cuda-freed by + * the caller). + * + * @return cudaSuccess on success, error enumeration otherwise + */ + cudaError_t EnactSort(RadixSortStorage &problem_storage); + + /* + * Destructor + */ + virtual ~BaseRadixSortingEnactor() {} +}; + + + +template +BaseRadixSortingEnactor::BaseRadixSortingEnactor( + int passes, + int max_radix_bits, + unsigned int num_elements, + int max_grid_size, + bool swizzle_pointers_for_odd_passes) +{ + // + // Get current device properties + // + + int current_device; + cudaGetDevice(¤t_device); + cudaGetDeviceProperties(&_device_props, current_device); + _device_sm_version = _device_props.major * 100 + _device_props.minor * 10; + + + // + // Get SM version of compiled kernel assembly + // + cudaFuncGetAttributes(&_spine_scan_kernel_attrs, SrtsScanSpine); + _kernel_ptx_version = _spine_scan_kernel_attrs.ptxVersion * 10; + + + // + // Determine number of CTAs to launch, shared memory, cycle elements, etc. 
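The constructor below splits the input into per-CTA work in whole "cycles" of cycle_elements: the first few CTAs get one extra cycle each, and any sub-cycle remainder is left to the last CTA. A stand-alone sketch of that arithmetic, using hypothetical sizes:

#include <cstdio>

int main()
{
    const unsigned int num_elements   = 1000000;
    const int          cycle_elements = 512;    // elements processed per cycle
    const int          grid_size      = 120;    // number of CTAs launched

    const int          total_cycles     = num_elements / cycle_elements;               // 1953
    const unsigned int cycles_per_block = total_cycles / grid_size;                    // 16
    const unsigned int extra_cycles     = total_cycles - cycles_per_block * grid_size; // 33

    const unsigned int big_block_elements    = (cycles_per_block + 1) * cycle_elements;     // 8704
    const unsigned int normal_block_elements = cycles_per_block * cycle_elements;           // 8192
    const unsigned int extra_elements_last   = num_elements - total_cycles * cycle_elements; // 64

    std::printf("%u big CTAs of %u elements, the rest of %u, tail of %u\n",
                extra_cycles, big_block_elements, normal_block_elements,
                extra_elements_last);
}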
+ // + + _passes = passes; + _num_elements = num_elements; + _keys_only = IsKeysOnly(); + _cycle_elements = B40C_RADIXSORT_CYCLE_ELEMENTS(_kernel_ptx_version , ConvertedKeyType, V); + _grid_size = GridSize(max_grid_size); + _swizzle_pointers_for_odd_passes = swizzle_pointers_for_odd_passes; + + int total_cycles = _num_elements / _cycle_elements; + unsigned int cycles_per_block = total_cycles / _grid_size; + unsigned int extra_cycles = total_cycles - (cycles_per_block * _grid_size); + + CtaDecomposition work_decomposition = { + extra_cycles, // num_big_blocks + (cycles_per_block + 1) * _cycle_elements, // big_block_elements + cycles_per_block * _cycle_elements, // normal_block_elements + _num_elements - (total_cycles * _cycle_elements), // extra_elements_last_block + _num_elements}; // num_elements + + _work_decomposition = work_decomposition; + + int spine_cycles = ((_grid_size * (1 << max_radix_bits)) + B40C_RADIXSORT_SPINE_CYCLE_ELEMENTS - 1) / B40C_RADIXSORT_SPINE_CYCLE_ELEMENTS; + _spine_elements = spine_cycles * B40C_RADIXSORT_SPINE_CYCLE_ELEMENTS; +} + + + +template +int BaseRadixSortingEnactor::GridSize(int max_grid_size) +{ + const int SINGLE_CTA_CUTOFF = 0; // right now zero; we have no single-cta sorting + + // find maximum number of threadblocks if "use-default" + if (max_grid_size == 0) { + + if (_num_elements <= static_cast(SINGLE_CTA_CUTOFF)) { + + // The problem size is too small to warrant a two-level reduction: + // use only one stream-processor + max_grid_size = 1; + + } else { + + if (_device_sm_version <= 120) { + + // G80/G90 + max_grid_size = _device_props.multiProcessorCount * 4; + + } else if (_device_sm_version < 200) { + + // GT200 (has some kind of TLB or icache drama) + int orig_max_grid_size = _device_props.multiProcessorCount * B40C_RADIXSORT_SCAN_SCATTER_CTA_OCCUPANCY(_kernel_ptx_version); + if (_keys_only) { + orig_max_grid_size *= (_num_elements + (1024 * 1024 * 96) - 1) / (1024 * 1024 * 96); + } else { + orig_max_grid_size *= (_num_elements + (1024 * 1024 * 64) - 1) / (1024 * 1024 * 64); + } + max_grid_size = orig_max_grid_size; + + if (_num_elements / _cycle_elements > static_cast(max_grid_size)) { + + double multiplier1 = 4.0; + double multiplier2 = 16.0; + + double delta1 = 0.068; + double delta2 = 0.127; + + int dividend = (_num_elements + _cycle_elements - 1) / _cycle_elements; + + while(true) { + + double quotient = ((double) dividend) / (multiplier1 * max_grid_size); + quotient -= (int) quotient; + + if ((quotient > delta1) && (quotient < 1 - delta1)) { + + quotient = ((double) dividend) / (multiplier2 * max_grid_size / 3.0); + quotient -= (int) quotient; + + if ((quotient > delta2) && (quotient < 1 - delta2)) { + break; + } + } + + if (max_grid_size == orig_max_grid_size - 2) { + max_grid_size = orig_max_grid_size - 30; + } else { + max_grid_size -= 1; + } + } + } + } else { + + // GF100 + max_grid_size = 418; + } + } + } + + // Calculate the actual number of threadblocks to launch. Initially + // assume that each threadblock will do only one cycle_elements worth + // of work, but then clamp it by the "max" restriction derived above + // in order to accomodate the "single-sp" and "saturated" cases. 
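A tiny stand-alone sketch of the clamping described above, with hypothetical values rather than ones taken from a real device:

#include <algorithm>
#include <cstdio>

int main()
{
    const unsigned int num_elements   = 3000;
    const int          cycle_elements = 512;
    const int          max_grid_size  = 120;   // result of the heuristics above

    int grid_size = num_elements / cycle_elements;    // one cycle of work per CTA
    grid_size = std::max(grid_size, 1);               // the "single-sp" case
    grid_size = std::min(grid_size, max_grid_size);   // the "saturated" case
    std::printf("launching %d CTAs\n", grid_size);    // prints 5
}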
+ + int grid_size = _num_elements / _cycle_elements; + if (grid_size == 0) { + grid_size = 1; + } + if (grid_size > max_grid_size) { + grid_size = max_grid_size; + } + + return grid_size; +} + + + +template +bool BaseRadixSortingEnactor:: +CanFit() +{ + long long bytes = (_num_elements * sizeof(K) * 2) + (_spine_elements * sizeof(int)); + if (!_keys_only) bytes += _num_elements * sizeof(V) * 2; + + if (_device_props.totalGlobalMem < 1024 * 1024 * 513) { + return (bytes < ((double) _device_props.totalGlobalMem) * 0.81); // allow up to 81% capacity for 512MB + } + + return (bytes < ((double) _device_props.totalGlobalMem) * 0.89); // allow up to 90% capacity +} + + + +template +template +cudaError_t BaseRadixSortingEnactor:: +DigitPlacePass(const RadixSortStorage &converted_storage) +{ + int threads = B40C_RADIXSORT_THREADS; + int dynamic_smem; + + cudaFuncAttributes reduce_kernel_attrs, scan_scatter_attrs; + cudaFuncGetAttributes(&reduce_kernel_attrs, RakingReduction); + cudaFuncGetAttributes(&scan_scatter_attrs, ScanScatterDigits); + + // + // Counting Reduction + // + + // Run tesla flush kernel if we have two or more threadblocks for each of the SMs + if ((_device_sm_version == 130) && (_work_decomposition.num_elements > static_cast(_device_props.multiProcessorCount * _cycle_elements * 2))) { + FlushKernel<<<_grid_size, B40C_RADIXSORT_THREADS, scan_scatter_attrs.sharedSizeBytes>>>(); + synchronize_if_enabled("FlushKernel"); + } + + // GF100 and GT200 get the same smem allocation for every kernel launch (pad the reduction/top-level-scan kernels) + dynamic_smem = (_kernel_ptx_version >= 130) ? scan_scatter_attrs.sharedSizeBytes - reduce_kernel_attrs.sharedSizeBytes : 0; + + RakingReduction <<<_grid_size, threads, dynamic_smem>>>( + converted_storage.d_from_alt_storage, + converted_storage.d_spine, + converted_storage.d_keys, + converted_storage.d_alt_keys, + _work_decomposition); + synchronize_if_enabled("RakingReduction"); + + + // + // Spine + // + + // GF100 and GT200 get the same smem allocation for every kernel launch (pad the reduction/top-level-scan kernels) + dynamic_smem = (_kernel_ptx_version >= 130) ? 
scan_scatter_attrs.sharedSizeBytes - _spine_scan_kernel_attrs.sharedSizeBytes : 0; + + SrtsScanSpine<<<_grid_size, B40C_RADIXSORT_SPINE_THREADS, dynamic_smem>>>( + converted_storage.d_spine, + converted_storage.d_spine, + _spine_elements); + synchronize_if_enabled("SrtsScanSpine"); + + + // + // Scanning Scatter + // + + // Run tesla flush kernel if we have two or more threadblocks for each of the SMs + if ((_device_sm_version == 130) && (_work_decomposition.num_elements > static_cast(_device_props.multiProcessorCount * _cycle_elements * 2))) { + FlushKernel<<<_grid_size, B40C_RADIXSORT_THREADS, scan_scatter_attrs.sharedSizeBytes>>>(); + synchronize_if_enabled("FlushKernel"); + } + + ScanScatterDigits <<<_grid_size, threads, 0>>>( + converted_storage.d_from_alt_storage, + converted_storage.d_spine, + converted_storage.d_keys, + converted_storage.d_alt_keys, + converted_storage.d_values, + converted_storage.d_alt_values, + _work_decomposition); + synchronize_if_enabled("ScanScatterDigits"); + + return cudaSuccess; +} + + + +template +cudaError_t BaseRadixSortingEnactor:: +EnactSort(RadixSortStorage &problem_storage) +{ + // + // Allocate device memory for temporary storage (if necessary) + // + + if (problem_storage.d_alt_keys == NULL) { + cudaMalloc((void**) &problem_storage.d_alt_keys, _num_elements * sizeof(K)); + } + if (!_keys_only && (problem_storage.d_alt_values == NULL)) { + cudaMalloc((void**) &problem_storage.d_alt_values, _num_elements * sizeof(V)); + } + if (problem_storage.d_spine == NULL) { + cudaMalloc((void**) &problem_storage.d_spine, _spine_elements * sizeof(int)); + } + if (problem_storage.d_from_alt_storage == NULL) { + cudaMalloc((void**) &problem_storage.d_from_alt_storage, 2 * sizeof(bool)); + } + + // Determine suitable type of unsigned byte storage to use for keys + typedef typename KeyConversion::UnsignedBits ConvertedKeyType; + + // Copy storage pointers to an appropriately typed stucture + RadixSortStorage converted_storage; + memcpy(&converted_storage, &problem_storage, sizeof(RadixSortStorage)); + + // + // Enact the sorting operation + // + + if (RADIXSORT_DEBUG) { + + printf("_device_sm_version: %d, _kernel_ptx_version: %d\n", _device_sm_version, _kernel_ptx_version); + printf("Bottom-level reduction & scan kernels:\n\tgrid_size: %d, \n\tthreads: %d, \n\tcycle_elements: %d, \n\tnum_big_blocks: %d, \n\tbig_block_elements: %d, \n\tnormal_block_elements: %d\n\textra_elements_last_block: %d\n\n", + _grid_size, B40C_RADIXSORT_THREADS, _cycle_elements, _work_decomposition.num_big_blocks, _work_decomposition.big_block_elements, _work_decomposition.normal_block_elements, _work_decomposition.extra_elements_last_block); + printf("Top-level spine scan:\n\tgrid_size: %d, \n\tthreads: %d, \n\tspine_block_elements: %d\n\n", + _grid_size, B40C_RADIXSORT_SPINE_THREADS, _spine_elements); + } + + cudaError_t retval = EnactDigitPlacePasses(converted_storage); + + + // + // Swizzle pointers if we left our sorted output in temp storage + // + + if (_swizzle_pointers_for_odd_passes) { + + cudaMemcpy( + &problem_storage.using_alternate_storage, + &problem_storage.d_from_alt_storage[_passes & 0x1], + sizeof(bool), + cudaMemcpyDeviceToHost); + + if (problem_storage.using_alternate_storage) { + thrust::swap(problem_storage.d_keys, problem_storage.d_alt_keys); + if (!_keys_only) { + thrust::swap(problem_storage.d_values, problem_storage.d_alt_values); + } + } + } + + return retval; +} + + + + + +/****************************************************************************** + * Sorting 
enactor classes + ******************************************************************************/ + +/** + * Generic sorting enactor class. Simply create an instance of this class + * with your key-type K (and optionally value-type V if sorting with satellite + * values). + * + * Template specialization provides the appropriate enactor instance to handle + * the specified data types. + * + * @template-param K + * Type of keys to be sorted + * + * @template-param V + * Type of values to be sorted. + * + * @template-param ConvertedKeyType + * Leave as default to effect necessary enactor specialization. + */ +template ::UnsignedBits> +class RadixSortingEnactor; + + + +/** + * Sorting enactor that is specialized for for 8-bit key types + */ +template +class RadixSortingEnactor : public BaseRadixSortingEnactor +{ +protected: + + typedef BaseRadixSortingEnactor Base; + typedef typename Base::ConvertedKeyType ConvertedKeyType; + + cudaError_t EnactDigitPlacePasses(const RadixSortStorage &converted_storage) + { + Base::template DigitPlacePass<0, 4, 0, PreprocessKeyFunctor, NopFunctor >(converted_storage); + Base::template DigitPlacePass<1, 4, 4, NopFunctor, PostprocessKeyFunctor > (converted_storage); + + return cudaSuccess; + } + +public: + + /** + * Constructor. + * + * @param[in] num_elements + * Length (in elements) of the input to a sorting operation + * + * @param[in] max_grid_size + * Maximum allowable number of CTAs to launch. The default value of 0 indicates + * that the dispatch logic should select an appropriate value for the target device. + */ + RadixSortingEnactor(unsigned int num_elements, int max_grid_size = 0) : Base::BaseRadixSortingEnactor(2, 4, num_elements, max_grid_size) {} + +}; + + + +/** + * Sorting enactor that is specialized for for 16-bit key types + */ +template +class RadixSortingEnactor : public BaseRadixSortingEnactor +{ +protected: + + typedef BaseRadixSortingEnactor Base; + typedef typename Base::ConvertedKeyType ConvertedKeyType; + + cudaError_t EnactDigitPlacePasses(const RadixSortStorage &converted_storage) + { + Base::template DigitPlacePass<0, 4, 0, PreprocessKeyFunctor, NopFunctor >(converted_storage); + Base::template DigitPlacePass<1, 4, 4, NopFunctor, NopFunctor >(converted_storage); + Base::template DigitPlacePass<2, 4, 8, NopFunctor, NopFunctor >(converted_storage); + Base::template DigitPlacePass<3, 4, 12, NopFunctor, PostprocessKeyFunctor > (converted_storage); + + return cudaSuccess; + } + +public: + + /** + * Constructor. + * + * @param[in] num_elements + * Length (in elements) of the input to a sorting operation + * + * @param[in] max_grid_size + * Maximum allowable number of CTAs to launch. The default value of 0 indicates + * that the dispatch logic should select an appropriate value for the target device. 
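 + *
 + * Illustrative usage of this specialization (a sketch only; the exact
 + * RadixSortStorage constructor arguments are an assumption, since this
 + * header only shows its pointer fields):
 + *
 + *   RadixSortingEnactor<unsigned short> enactor(num_elements);
 + *   RadixSortStorage<unsigned short>    storage(d_keys);  // d_keys: device pointer
 + *   enactor.EnactSort(storage);
 + *   // after the sort, storage.d_keys references the sorted output
 + *   // (pointers are swizzled back from alternate storage if needed)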
+ */ + RadixSortingEnactor(unsigned int num_elements, int max_grid_size = 0) : Base::BaseRadixSortingEnactor(4, 4, num_elements, max_grid_size) {} + +}; + + +/** + * Sorting enactor that is specialized for for 32-bit key types + */ +template +class RadixSortingEnactor : public BaseRadixSortingEnactor +{ +protected: + + typedef BaseRadixSortingEnactor Base; + typedef typename Base::ConvertedKeyType ConvertedKeyType; + + cudaError_t EnactDigitPlacePasses(const RadixSortStorage &converted_storage) + { + Base::template DigitPlacePass<0, 4, 0, PreprocessKeyFunctor, NopFunctor >(converted_storage); + Base::template DigitPlacePass<1, 4, 4, NopFunctor, NopFunctor >(converted_storage); + Base::template DigitPlacePass<2, 4, 8, NopFunctor, NopFunctor >(converted_storage); + Base::template DigitPlacePass<3, 4, 12, NopFunctor, NopFunctor >(converted_storage); + Base::template DigitPlacePass<4, 4, 16, NopFunctor, NopFunctor >(converted_storage); + Base::template DigitPlacePass<5, 4, 20, NopFunctor, NopFunctor >(converted_storage); + Base::template DigitPlacePass<6, 4, 24, NopFunctor, NopFunctor >(converted_storage); + Base::template DigitPlacePass<7, 4, 28, NopFunctor, PostprocessKeyFunctor > (converted_storage); + + return cudaSuccess; + } + +public: + + /** + * Constructor. + * + * @param[in] num_elements + * Length (in elements) of the input to a sorting operation + * + * @param[in] max_grid_size + * Maximum allowable number of CTAs to launch. The default value of 0 indicates + * that the dispatch logic should select an appropriate value for the target device. + */ + RadixSortingEnactor(unsigned int num_elements, int max_grid_size = 0) : Base::BaseRadixSortingEnactor(8, 4, num_elements, max_grid_size) {} + +}; + + + +/** + * Sorting enactor that is specialized for for 64-bit key types + */ +template +class RadixSortingEnactor : public BaseRadixSortingEnactor +{ +protected: + + typedef BaseRadixSortingEnactor Base; + typedef typename Base::ConvertedKeyType ConvertedKeyType; + + cudaError_t EnactDigitPlacePasses(const RadixSortStorage &converted_storage) + { + Base::template DigitPlacePass<0, 4, 0, PreprocessKeyFunctor, NopFunctor >(converted_storage); + Base::template DigitPlacePass<1, 4, 4, NopFunctor, NopFunctor >(converted_storage); + Base::template DigitPlacePass<2, 4, 8, NopFunctor, NopFunctor >(converted_storage); + Base::template DigitPlacePass<3, 4, 12, NopFunctor, NopFunctor >(converted_storage); + Base::template DigitPlacePass<4, 4, 16, NopFunctor, NopFunctor >(converted_storage); + Base::template DigitPlacePass<5, 4, 20, NopFunctor, NopFunctor >(converted_storage); + Base::template DigitPlacePass<6, 4, 24, NopFunctor, NopFunctor >(converted_storage); + Base::template DigitPlacePass<7, 4, 28, NopFunctor, NopFunctor >(converted_storage); + Base::template DigitPlacePass<8, 4, 32, NopFunctor, NopFunctor >(converted_storage); + Base::template DigitPlacePass<9, 4, 36, NopFunctor, NopFunctor >(converted_storage); + Base::template DigitPlacePass<10, 4, 40, NopFunctor, NopFunctor >(converted_storage); + Base::template DigitPlacePass<11, 4, 44, NopFunctor, NopFunctor >(converted_storage); + Base::template DigitPlacePass<12, 4, 48, NopFunctor, NopFunctor >(converted_storage); + Base::template DigitPlacePass<13, 4, 52, NopFunctor, NopFunctor >(converted_storage); + Base::template DigitPlacePass<14, 4, 56, NopFunctor, NopFunctor >(converted_storage); + Base::template DigitPlacePass<15, 4, 60, NopFunctor, PostprocessKeyFunctor > (converted_storage); + + return cudaSuccess; + } + +public: + + /** + * 
Constructor. + * + * @param[in] num_elements + * Length (in elements) of the input to a sorting operation + * + * @param[in] max_grid_size + * Maximum allowable number of CTAs to launch. The default value of 0 indicates + * that the dispatch logic should select an appropriate value for the target device. + */ + RadixSortingEnactor(unsigned int num_elements, int max_grid_size = 0) : Base::BaseRadixSortingEnactor(16, 4, num_elements, max_grid_size) {} + +}; + + +} // end namespace b40c_thrust +} // end namespace detail +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cuda/detail/detail/b40c/radixsort_kernel_common.h b/compat/thrust/system/cuda/detail/detail/b40c/radixsort_kernel_common.h new file mode 100644 index 0000000..7899dc3 --- /dev/null +++ b/compat/thrust/system/cuda/detail/detail/b40c/radixsort_kernel_common.h @@ -0,0 +1,173 @@ +/****************************************************************************** + * + * Copyright 2010 Duane Merrill + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * + * + * + * AUTHORS' REQUEST: + * + * If you use|reference|benchmark this code, please cite our Technical + * Report (http://www.cs.virginia.edu/~dgm4d/papers/RadixSortTR.pdf): + * + * @TechReport{ Merrill:Sorting:2010, + * author = "Duane Merrill and Andrew Grimshaw", + * title = "Revisiting Sorting for GPGPU Stream Architectures", + * year = "2010", + * institution = "University of Virginia, Department of Computer Science", + * address = "Charlottesville, VA, USA", + * number = "CS2010-03" + * } + * + * For more information, see our Google Code project site: + * http://code.google.com/p/back40computing/ + * + * Thanks! + * + ******************************************************************************/ + + +/****************************************************************************** + * Configuration management for B40C radix sorting kernels + ******************************************************************************/ + +#pragma once + +#include "kernel_utils.h" +#include "vector_types.h" +#include "radixsort_key_conversion.h" + +namespace thrust { +namespace system { +namespace cuda { +namespace detail { +namespace detail { +namespace b40c_thrust { + + +/****************************************************************************** + * Radix sorting configuration + ******************************************************************************/ + +// 128 threads +#define B40C_RADIXSORT_LOG_THREADS 7 +#define B40C_RADIXSORT_THREADS (1 << B40C_RADIXSORT_LOG_THREADS) + +// Target threadblock occupancy for counting/reduction kernel +#define B40C_SM20_REDUCE_CTA_OCCUPANCY() (8) // 8 threadblocks on GF100 +#define B40C_SM12_REDUCE_CTA_OCCUPANCY() (5) // 5 threadblocks on GT200 +#define B40C_SM10_REDUCE_CTA_OCCUPANCY() (3) // 4 threadblocks on G80 +#define B40C_RADIXSORT_REDUCE_CTA_OCCUPANCY(version) ((version >= 200) ? B40C_SM20_REDUCE_CTA_OCCUPANCY() : \ + (version >= 120) ? 
B40C_SM12_REDUCE_CTA_OCCUPANCY() : \ + B40C_SM10_REDUCE_CTA_OCCUPANCY()) + +// Target threadblock occupancy for bulk scan/scatter kernel +#define B40C_SM20_SCAN_SCATTER_CTA_OCCUPANCY() (7) // 7 threadblocks on GF100 +#define B40C_SM12_SCAN_SCATTER_CTA_OCCUPANCY() (5) // 5 threadblocks on GT200 +#define B40C_SM10_SCAN_SCATTER_CTA_OCCUPANCY() (2) // 2 threadblocks on G80 +#define B40C_RADIXSORT_SCAN_SCATTER_CTA_OCCUPANCY(version) ((version >= 200) ? B40C_SM20_SCAN_SCATTER_CTA_OCCUPANCY() : \ + (version >= 120) ? B40C_SM12_SCAN_SCATTER_CTA_OCCUPANCY() : \ + B40C_SM10_SCAN_SCATTER_CTA_OCCUPANCY()) + +// Number of 256-element sets to rake per raking pass +#define B40C_SM20_LOG_SETS_PER_PASS() (1) // 2 sets on GF100 +#define B40C_SM12_LOG_SETS_PER_PASS() (0) // 1 set on GT200 +#define B40C_SM10_LOG_SETS_PER_PASS() (1) // 2 sets on G80 +#define B40C_RADIXSORT_LOG_SETS_PER_PASS(version) ((version >= 200) ? B40C_SM20_LOG_SETS_PER_PASS() : \ + (version >= 120) ? B40C_SM12_LOG_SETS_PER_PASS() : \ + B40C_SM10_LOG_SETS_PER_PASS()) + +// Number of raking passes per cycle +#define B40C_SM20_LOG_PASSES_PER_CYCLE(K, V) (((B40C_MAX(sizeof(K), sizeof(V)) > 4) || _B40C_LP64_) ? 0 : 1) // 2 passes on GF100 (only one for large keys/values, or for 64-bit device pointers) +#define B40C_SM12_LOG_PASSES_PER_CYCLE(K, V) (B40C_MAX(sizeof(K), sizeof(V)) > 4 ? 0 : 1) // 2 passes on GT200 (only for large keys/values) +#define B40C_SM10_LOG_PASSES_PER_CYCLE(K, V) (0) // 1 pass on G80 +#define B40C_RADIXSORT_LOG_PASSES_PER_CYCLE(version, K, V) ((version >= 200) ? B40C_SM20_LOG_PASSES_PER_CYCLE(K, V) : \ + (version >= 120) ? B40C_SM12_LOG_PASSES_PER_CYCLE(K, V) : \ + B40C_SM10_LOG_PASSES_PER_CYCLE(K, V)) + + +// Number of raking threads per raking pass +#define B40C_SM20_LOG_RAKING_THREADS_PER_PASS() (B40C_LOG_WARP_THREADS + 1) // 2 raking warps on GF100 +#define B40C_SM12_LOG_RAKING_THREADS_PER_PASS() (B40C_LOG_WARP_THREADS) // 1 raking warp on GT200 +#define B40C_SM10_LOG_RAKING_THREADS_PER_PASS() (B40C_LOG_WARP_THREADS + 2) // 4 raking warps on G80 +#define B40C_RADIXSORT_LOG_RAKING_THREADS_PER_PASS(version) ((version >= 200) ? B40C_SM20_LOG_RAKING_THREADS_PER_PASS() : \ + (version >= 120) ? 
B40C_SM12_LOG_RAKING_THREADS_PER_PASS() : \ + B40C_SM10_LOG_RAKING_THREADS_PER_PASS()) + + +// Number of elements per cycle +#define B40C_RADIXSORT_LOG_CYCLE_ELEMENTS(version, K, V) (B40C_RADIXSORT_LOG_SETS_PER_PASS(version) + B40C_RADIXSORT_LOG_PASSES_PER_CYCLE(version, K, V) + B40C_RADIXSORT_LOG_THREADS + 1) +#define B40C_RADIXSORT_CYCLE_ELEMENTS(version, K, V) (1 << B40C_RADIXSORT_LOG_CYCLE_ELEMENTS(version, K, V)) + +// Number of warps per CTA +#define B40C_RADIXSORT_LOG_WARPS (B40C_RADIXSORT_LOG_THREADS - B40C_LOG_WARP_THREADS) +#define B40C_RADIXSORT_WARPS (1 << B40C_RADIXSORT_LOG_WARPS) + +// Number of threads for spine-scanning kernel +#define B40C_RADIXSORT_LOG_SPINE_THREADS 7 // 128 threads +#define B40C_RADIXSORT_SPINE_THREADS (1 << B40C_RADIXSORT_LOG_SPINE_THREADS) + +// Number of elements per spine-scanning cycle +#define B40C_RADIXSORT_LOG_SPINE_CYCLE_ELEMENTS 9 // 512 elements +#define B40C_RADIXSORT_SPINE_CYCLE_ELEMENTS (1 << B40C_RADIXSORT_LOG_SPINE_CYCLE_ELEMENTS) + + + +/****************************************************************************** + * SRTS Control Structures + ******************************************************************************/ + + +/** + * Value-type structure denoting keys-only sorting + */ +struct KeysOnlyType {}; + +/** + * Returns whether or not the templated type indicates keys-only sorting + */ +template +inline __host__ __device__ bool IsKeysOnly() {return false;} + + +/** + * Returns whether or not the templated type indicates keys-only sorting + */ +template <> +inline __host__ __device__ bool IsKeysOnly() {return true;} + + +/** + * A given threadblock may receive one of three different amounts of + * work: "big", "normal", and "last". The big workloads are one + * cycle_elements greater than the normal, and the last workload + * does the extra (problem-size % cycle_elements) work. + */ +struct CtaDecomposition { + unsigned int num_big_blocks; + unsigned int big_block_elements; + unsigned int normal_block_elements; + unsigned int extra_elements_last_block; + unsigned int num_elements; +}; + + +} // end namespace b40c_thrust +} // end namespace detail +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cuda/detail/detail/b40c/radixsort_key_conversion.h b/compat/thrust/system/cuda/detail/detail/b40c/radixsort_key_conversion.h new file mode 100644 index 0000000..a170f95 --- /dev/null +++ b/compat/thrust/system/cuda/detail/detail/b40c/radixsort_key_conversion.h @@ -0,0 +1,352 @@ +/****************************************************************************** + * + * Copyright 2010 Duane Merrill + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * + * + * + * AUTHORS' REQUEST: + * + * If you use|reference|benchmark this code, please cite our Technical + * Report (http://www.cs.virginia.edu/~dgm4d/papers/RadixSortTR.pdf): + * + * @TechReport{ Merrill:Sorting:2010, + * author = "Duane Merrill and Andrew Grimshaw", + * title = "Revisiting Sorting for GPGPU Stream Architectures", + * year = "2010", + * institution = "University of Virginia, Department of Computer Science", + * address = "Charlottesville, VA, USA", + * number = "CS2010-03" + * } + * + * For more information, see our Google Code project site: + * http://code.google.com/p/back40computing/ + * + * Thanks! + * + ******************************************************************************/ + + +/****************************************************************************** + * Functors for converting signed and floating point types to unsigned types + * suitable for radix sorting + ******************************************************************************/ + +#pragma once + +namespace thrust { +namespace system { +namespace cuda { +namespace detail { +namespace detail { +namespace b40c_thrust { + + +// +// Do-nothing functors +// + +template +struct NopFunctor{ + template + __device__ __host__ __forceinline__ void operator()(ConvertedKeyType &converted_key) {} + __device__ __host__ __forceinline__ static bool MustApply(){ return false;} +}; + +// +// Do-nothing functors that indicate a mandatory pass +// + +template +struct MandatoryPassNopFunctor{ + template + __device__ __host__ __forceinline__ void operator()(ConvertedKeyType &converted_key) {} + __device__ __host__ __forceinline__ static bool MustApply(){ return false;} +}; + + +// +// Conversion for generic unsigned types +// + +template struct KeyConversion { + typedef T UnsignedBits; +}; + +template +struct PreprocessKeyFunctor{ + template + __device__ __host__ __forceinline__ void operator()(ConvertedKeyType &converted_key) {} + __device__ __host__ __forceinline__ static bool MustApply(){ return false;} +}; + +template +struct PostprocessKeyFunctor { + template + __device__ __host__ __forceinline__ void operator()(ConvertedKeyType &converted_key) {} + __device__ __host__ __forceinline__ static bool MustApply(){ return false;} +}; + + + +// +// Conversion for floats +// + +template <> struct KeyConversion { + typedef unsigned int UnsignedBits; +}; + +template <> +struct PreprocessKeyFunctor { + __device__ __host__ __forceinline__ void operator()(unsigned int &converted_key) { + + unsigned int mask = (converted_key & 0x80000000) ? 0xffffffff : 0x80000000; + converted_key ^= mask; + } + __device__ __host__ __forceinline__ static bool MustApply(){ return true;} +}; + +template <> +struct PostprocessKeyFunctor { + __device__ __host__ __forceinline__ void operator()(unsigned int &converted_key) { + + unsigned int mask = (converted_key & 0x80000000) ? 0x80000000 : 0xffffffff; + converted_key ^= mask; + } + __device__ __host__ __forceinline__ static bool MustApply(){ return true;} +}; + + + +// +// Conversion for doubles +// + +template <> struct KeyConversion { + typedef unsigned long long UnsignedBits; +}; + +template <> +struct PreprocessKeyFunctor { + __device__ __host__ __forceinline__ void operator()(unsigned long long &converted_key) { + + unsigned long long mask = (converted_key & 0x8000000000000000) ? 
0xffffffffffffffff : 0x8000000000000000; + converted_key ^= mask; + } + __device__ __host__ __forceinline__ static bool MustApply(){ return true;} +}; + +template <> +struct PostprocessKeyFunctor { + __device__ __host__ __forceinline__ void operator()(unsigned long long &converted_key) { + unsigned long long mask = (converted_key & 0x8000000000000000) ? 0x8000000000000000 : 0xffffffffffffffff; + converted_key ^= mask; + } + __device__ __host__ __forceinline__ static bool MustApply(){ return true;} +}; + + +// +// Conversion for signed chars +// + +template <> struct KeyConversion { + typedef unsigned char UnsignedBits; +}; + +template <> +struct PreprocessKeyFunctor { + __device__ __host__ __forceinline__ void operator()(unsigned char &converted_key) { + // char is unsigned on some platforms, so we have to check + if(std::numeric_limits::is_signed) + { + const unsigned int SIGN_MASK = 1u << ((sizeof(char) * 8) - 1); + converted_key ^= SIGN_MASK; + } + } + __device__ __host__ __forceinline__ static bool MustApply(){ return std::numeric_limits::is_signed;} +}; + +template <> +struct PostprocessKeyFunctor { + __device__ __host__ __forceinline__ void operator()(unsigned char &converted_key) { + // char is unsigned on some platforms, so we have to check + if(std::numeric_limits::is_signed) + { + const unsigned int SIGN_MASK = 1u << ((sizeof(char) * 8) - 1); + converted_key ^= SIGN_MASK; + } + } + __device__ __host__ __forceinline__ static bool MustApply(){ return std::numeric_limits::is_signed;} +}; + + +// TODO handle this more gracefully +template <> struct KeyConversion { + typedef unsigned char UnsignedBits; +}; + +template <> +struct PreprocessKeyFunctor { + __device__ __host__ __forceinline__ void operator()(unsigned char &converted_key) { + const unsigned int SIGN_MASK = 1u << ((sizeof(char) * 8) - 1); + converted_key ^= SIGN_MASK; + } + __device__ __host__ __forceinline__ static bool MustApply(){ return true;} +}; + +template <> +struct PostprocessKeyFunctor { + __device__ __host__ __forceinline__ void operator()(unsigned char &converted_key) { + const unsigned int SIGN_MASK = 1u << ((sizeof(char) * 8) - 1); + converted_key ^= SIGN_MASK; + } + __device__ __host__ __forceinline__ static bool MustApply(){ return true;} +}; + + +// +// Conversion for signed shorts +// + +template <> struct KeyConversion { + typedef unsigned short UnsignedBits; +}; + +template <> +struct PreprocessKeyFunctor { + __device__ __host__ __forceinline__ void operator()(unsigned short &converted_key) { + const unsigned int SIGN_MASK = 1u << ((sizeof(short) * 8) - 1); + converted_key ^= SIGN_MASK; + } + __device__ __host__ __forceinline__ static bool MustApply(){ return true;} +}; + +template <> +struct PostprocessKeyFunctor { + __device__ __host__ __forceinline__ void operator()(unsigned short &converted_key) { + const unsigned int SIGN_MASK = 1u << ((sizeof(short) * 8) - 1); + converted_key ^= SIGN_MASK; + } + __device__ __host__ __forceinline__ static bool MustApply(){ return true;} +}; + + + +// +// Conversion for signed ints +// + +template <> struct KeyConversion { + typedef unsigned int UnsignedBits; +}; + +template <> +struct PreprocessKeyFunctor { + __device__ __host__ __forceinline__ void operator()(unsigned int &converted_key) { + const unsigned int SIGN_MASK = 1u << ((sizeof(int) * 8) - 1); + converted_key ^= SIGN_MASK; + } + __device__ __host__ __forceinline__ static bool MustApply(){ return true;} +}; + +template <> +struct PostprocessKeyFunctor { + __device__ __host__ __forceinline__ void 
operator()(unsigned int &converted_key) { + const unsigned int SIGN_MASK = 1u << ((sizeof(int) * 8) - 1); + converted_key ^= SIGN_MASK; + } + __device__ __host__ __forceinline__ static bool MustApply(){ return true;} +}; + + + +// +// Conversion for signed longs +// + +// TODO rework this with metaprogramming +template <> struct KeyConversion { +#if ULONG_MAX == UINT_MAX + typedef unsigned int UnsignedBits; +#else + typedef unsigned long long UnsignedBits; +#endif +}; + +// TODO rework this with metaprogramming +template <> struct KeyConversion { +#if ULONG_MAX == UINT_MAX + typedef unsigned int UnsignedBits; +#else + typedef unsigned long long UnsignedBits; +#endif +}; + +template <> +struct PreprocessKeyFunctor { + __device__ __host__ __forceinline__ void operator()(typename KeyConversion::UnsignedBits& converted_key) { + const typename KeyConversion::UnsignedBits SIGN_MASK = 1ul << ((sizeof(long) * 8) - 1); + converted_key ^= SIGN_MASK; + } + __device__ __host__ __forceinline__ static bool MustApply(){ return true;} +}; + +template <> +struct PostprocessKeyFunctor { + __device__ __host__ __forceinline__ void operator()(typename KeyConversion::UnsignedBits& converted_key) { + const typename KeyConversion::UnsignedBits SIGN_MASK = 1ul << ((sizeof(long) * 8) - 1); + converted_key ^= SIGN_MASK; + } + __device__ __host__ __forceinline__ static bool MustApply(){ return true;} +}; + + + +// +// Conversion for signed long longs +// + +template <> struct KeyConversion { + typedef unsigned long long UnsignedBits; +}; + +template <> +struct PreprocessKeyFunctor { + __device__ __host__ __forceinline__ void operator()(unsigned long long &converted_key) { + const unsigned long long SIGN_MASK = 1ull << ((sizeof(long long) * 8) - 1); + converted_key ^= SIGN_MASK; + } + __device__ __host__ __forceinline__ static bool MustApply(){ return true;} +}; + +template <> +struct PostprocessKeyFunctor { + __device__ __host__ __forceinline__ void operator()(unsigned long long &converted_key) { + const unsigned long long SIGN_MASK = 1ull << ((sizeof(long long) * 8) - 1); + converted_key ^= SIGN_MASK; + } + __device__ __host__ __forceinline__ static bool MustApply(){ return true;} +}; + + +} // end namespace b40c_thrust +} // end namespace detail +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cuda/detail/detail/b40c/radixsort_reduction_kernel.h b/compat/thrust/system/cuda/detail/detail/b40c/radixsort_reduction_kernel.h new file mode 100644 index 0000000..a8f91d3 --- /dev/null +++ b/compat/thrust/system/cuda/detail/detail/b40c/radixsort_reduction_kernel.h @@ -0,0 +1,439 @@ +/****************************************************************************** + * + * Copyright 2010 Duane Merrill + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * + * + * + * AUTHORS' REQUEST: + * + * If you use|reference|benchmark this code, please cite our Technical + * Report (http://www.cs.virginia.edu/~dgm4d/papers/RadixSortTR.pdf): + * + * @TechReport{ Merrill:Sorting:2010, + * author = "Duane Merrill and Andrew Grimshaw", + * title = "Revisiting Sorting for GPGPU Stream Architectures", + * year = "2010", + * institution = "University of Virginia, Department of Computer Science", + * address = "Charlottesville, VA, USA", + * number = "CS2010-03" + * } + * + * For more information, see our Google Code project site: + * http://code.google.com/p/back40computing/ + * + * Thanks! + * + ******************************************************************************/ + + +/****************************************************************************** + * Bottom-level digit-reduction/counting kernel + ******************************************************************************/ + +#pragma once + +#include "radixsort_kernel_common.h" + +namespace thrust { +namespace system { +namespace cuda { +namespace detail { +namespace detail { +namespace b40c_thrust { + +/****************************************************************************** + * Defines + ******************************************************************************/ + +const int BYTE_ENCODE_SHIFT = 0x3; + + +/****************************************************************************** + * Cycle-processing Routines + ******************************************************************************/ + +__device__ __forceinline__ int DecodeInt(int encoded, int quad_byte){ + return (encoded >> quad_byte) & 0xff; // shift right 8 bits per digit and return rightmost 8 bits +} + + +__device__ __forceinline__ int EncodeInt(int count, int quad_byte) { + return count << quad_byte; // shift left 8 bits per digit +} + + +template +__device__ __forceinline__ void DecodeDigit( + K key, + int &lane, + int &quad_shift) +{ + const K DIGIT_MASK = RADIX_DIGITS - 1; + lane = (key & (DIGIT_MASK << BIT)) >> (BIT + 2); + + const K QUAD_MASK = (RADIX_DIGITS < 4) ? 0x1 : 0x3; + if (BIT == 32) { + // N.B.: This takes one more instruction than the code below it, but + // otherwise the compiler goes nuts and shoves hundreds of bytes + // to lmem when bit = 32 on 64-bit keys. 
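 + //
 + // Illustration: with RADIX_DIGITS == 16 and BIT == 0, a key whose current
 + // 4-bit digit is 11 (0xB) gives lane = 11 >> 2 = 2 and
 + // quad_shift = (11 & 3) << BYTE_ENCODE_SHIFT = 24 bits, i.e. the count for
 + // that digit is accumulated in byte 3 of scan lane 2's packed counter.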
+ quad_shift = ((key >> BIT) & QUAD_MASK) << BYTE_ENCODE_SHIFT; + } else { + quad_shift = MagnitudeShift(key & (QUAD_MASK << BIT)); + } +} + + +template +__device__ __forceinline__ void ReduceEncodedCounts( + int local_counts[LANES_PER_WARP][4], + int encoded_carry[SCAN_LANES][B40C_RADIXSORT_THREADS]) +{ + const int LOG_PARTIALS_PER_THREAD = B40C_RADIXSORT_LOG_THREADS - B40C_LOG_WARP_THREADS; + const int PARTIALS_PER_THREAD = 1 << LOG_PARTIALS_PER_THREAD; + + int encoded; + int idx = threadIdx.x & (B40C_WARP_THREADS - 1); + + + __syncthreads(); + + #pragma unroll + for (int j = 0; j < (int) LANES_PER_WARP; j++) { + + int warp_id = (threadIdx.x >> B40C_LOG_WARP_THREADS) + (j * B40C_RADIXSORT_WARPS); + if (warp_id < SCAN_LANES) { + + // rest of my elements + #pragma unroll + for (int i = 0; i < (int) PARTIALS_PER_THREAD; i++) { + encoded = encoded_carry[warp_id][idx + (i * B40C_WARP_THREADS)]; + local_counts[j][0] += DecodeInt(encoded, 0u << BYTE_ENCODE_SHIFT); + local_counts[j][1] += DecodeInt(encoded, 1u << BYTE_ENCODE_SHIFT); + local_counts[j][2] += DecodeInt(encoded, 2u << BYTE_ENCODE_SHIFT); + local_counts[j][3] += DecodeInt(encoded, 3u << BYTE_ENCODE_SHIFT); + } + + if (FINAL_REDUCE) { + // reduce all four packed fields, leaving them in the first four elements of our row + WarpReduce(idx, &encoded_carry[warp_id][0], local_counts[j][0]); + WarpReduce(idx, &encoded_carry[warp_id][1], local_counts[j][1]); + WarpReduce(idx, &encoded_carry[warp_id][2], local_counts[j][2]); + WarpReduce(idx, &encoded_carry[warp_id][3], local_counts[j][3]); + } + } + } + + __syncthreads(); + +} + + +template +__device__ __forceinline__ void Bucket( + K input, + int encoded_carry[SCAN_LANES][B40C_RADIXSORT_THREADS], + PreprocessFunctor preprocess = PreprocessFunctor()) +{ + int lane, quad_shift; + preprocess(input); + DecodeDigit(input, lane, quad_shift); + encoded_carry[lane][threadIdx.x] += EncodeInt(1, quad_shift); +} + + +template +struct LoadOp; + +template +struct LoadOp +{ + static __device__ __forceinline__ void BlockOfLoads(K *d_in_keys, int offset, int encoded_carry[SCAN_LANES][B40C_RADIXSORT_THREADS]) + { + K key = d_in_keys[offset + threadIdx.x]; + Bucket(key, encoded_carry); + } +}; + +template +struct LoadOp +{ + static __device__ __forceinline__ void BlockOfLoads(K *d_in_keys, int offset, int encoded_carry[SCAN_LANES][B40C_RADIXSORT_THREADS]) + { + LoadOp::BlockOfLoads(d_in_keys, offset + (B40C_RADIXSORT_THREADS * 0), encoded_carry); + LoadOp::BlockOfLoads(d_in_keys, offset + (B40C_RADIXSORT_THREADS * 1), encoded_carry); + } +}; + +template +struct LoadOp +{ + static __device__ __forceinline__ void BlockOfLoads(K *d_in_keys, int offset, int encoded_carry[SCAN_LANES][B40C_RADIXSORT_THREADS]) + { + LoadOp::BlockOfLoads(d_in_keys, offset + (B40C_RADIXSORT_THREADS * 0), encoded_carry); + LoadOp::BlockOfLoads(d_in_keys, offset + (B40C_RADIXSORT_THREADS * 2), encoded_carry); + } +}; + +template +struct LoadOp +{ + static __device__ __forceinline__ void BlockOfLoads(K *d_in_keys, int offset, int encoded_carry[SCAN_LANES][B40C_RADIXSORT_THREADS]) + { + K keys[8]; + + keys[0] = d_in_keys[offset + (B40C_RADIXSORT_THREADS * 0) + threadIdx.x]; + keys[1] = d_in_keys[offset + (B40C_RADIXSORT_THREADS * 1) + threadIdx.x]; + keys[2] = d_in_keys[offset + (B40C_RADIXSORT_THREADS * 2) + threadIdx.x]; + keys[3] = d_in_keys[offset + (B40C_RADIXSORT_THREADS * 3) + threadIdx.x]; + + if (B40C_FERMI(__CUDA_ARCH__)) __syncthreads(); + + keys[4] = d_in_keys[offset + (B40C_RADIXSORT_THREADS * 4) + threadIdx.x]; + keys[5] = 
d_in_keys[offset + (B40C_RADIXSORT_THREADS * 5) + threadIdx.x]; + keys[6] = d_in_keys[offset + (B40C_RADIXSORT_THREADS * 6) + threadIdx.x]; + keys[7] = d_in_keys[offset + (B40C_RADIXSORT_THREADS * 7) + threadIdx.x]; + + Bucket(keys[0], encoded_carry); + Bucket(keys[1], encoded_carry); + Bucket(keys[2], encoded_carry); + Bucket(keys[3], encoded_carry); + Bucket(keys[4], encoded_carry); + Bucket(keys[5], encoded_carry); + Bucket(keys[6], encoded_carry); + Bucket(keys[7], encoded_carry); + } +}; + +template +struct LoadOp { + + static __device__ __forceinline__ void BlockOfLoads(K *d_in_keys, int offset, int encoded_carry[SCAN_LANES][B40C_RADIXSORT_THREADS]) + { + LoadOp::BlockOfLoads(d_in_keys, offset + (B40C_RADIXSORT_THREADS * 0), encoded_carry); + LoadOp::BlockOfLoads(d_in_keys, offset + (B40C_RADIXSORT_THREADS * 8), encoded_carry); + } +}; + +template +struct LoadOp { + + static __device__ __forceinline__ void BlockOfLoads(K *d_in_keys, int offset, int encoded_carry[SCAN_LANES][B40C_RADIXSORT_THREADS]) + { + LoadOp::BlockOfLoads(d_in_keys, offset + (B40C_RADIXSORT_THREADS * 0), encoded_carry); + LoadOp::BlockOfLoads(d_in_keys, offset + (B40C_RADIXSORT_THREADS * 16), encoded_carry); + } +}; + +template +struct LoadOp { + + static __device__ __forceinline__ void BlockOfLoads(K *d_in_keys, int offset, int encoded_carry[SCAN_LANES][B40C_RADIXSORT_THREADS]) + { + LoadOp::BlockOfLoads(d_in_keys, offset + (B40C_RADIXSORT_THREADS * 0), encoded_carry); + LoadOp::BlockOfLoads(d_in_keys, offset + (B40C_RADIXSORT_THREADS * 32), encoded_carry); + } +}; + +template +struct LoadOp { + + static __device__ __forceinline__ void BlockOfLoads(K *d_in_keys, int offset, int encoded_carry[SCAN_LANES][B40C_RADIXSORT_THREADS]) + { + LoadOp::BlockOfLoads(d_in_keys, offset + (B40C_RADIXSORT_THREADS * 0), encoded_carry); + LoadOp::BlockOfLoads(d_in_keys, offset + (B40C_RADIXSORT_THREADS * 64), encoded_carry); + } +}; + +template +struct LoadOp { + + static __device__ __forceinline__ void BlockOfLoads(K *d_in_keys, int offset, int encoded_carry[SCAN_LANES][B40C_RADIXSORT_THREADS]) + { + LoadOp::BlockOfLoads(d_in_keys, offset + (B40C_RADIXSORT_THREADS * 0), encoded_carry); + LoadOp::BlockOfLoads(d_in_keys, offset + (B40C_RADIXSORT_THREADS * 128), encoded_carry); + LoadOp::BlockOfLoads(d_in_keys, offset + (B40C_RADIXSORT_THREADS * 192), encoded_carry); + LoadOp::BlockOfLoads(d_in_keys, offset + (B40C_RADIXSORT_THREADS * 224), encoded_carry); + LoadOp::BlockOfLoads(d_in_keys, offset + (B40C_RADIXSORT_THREADS * 240), encoded_carry); + LoadOp::BlockOfLoads(d_in_keys, offset + (B40C_RADIXSORT_THREADS * 248), encoded_carry); + } +}; + + +template +__device__ __forceinline__ void ResetEncodedCarry( + int encoded_carry[SCAN_LANES][B40C_RADIXSORT_THREADS]) +{ + #pragma unroll + for (int SCAN_LANE = 0; SCAN_LANE < (int) SCAN_LANES; SCAN_LANE++) { + encoded_carry[SCAN_LANE][threadIdx.x] = 0; + } +} + + +template +__device__ __forceinline__ int ProcessLoads( + K *d_in_keys, + int loads, + int &offset, + int encoded_carry[SCAN_LANES][B40C_RADIXSORT_THREADS], + int local_counts[LANES_PER_WARP][4]) +{ + // Unroll batches of loads with occasional reduction to avoid overflow + while (loads >= 32) { + + LoadOp::BlockOfLoads(d_in_keys, offset, encoded_carry); + offset += B40C_RADIXSORT_THREADS * 32; + loads -= 32; + + // Reduce int local count registers to prevent overflow + ReduceEncodedCounts( + local_counts, + encoded_carry); + + // Reset encoded counters + ResetEncodedCarry(encoded_carry); + } + + int retval = loads; + + // Wind 
down loads in decreasing batch sizes + + while (loads >= 4) { + LoadOp::BlockOfLoads(d_in_keys, offset, encoded_carry); + offset += B40C_RADIXSORT_THREADS * 4; + loads -= 4; + } + + while (loads) { + LoadOp::BlockOfLoads(d_in_keys, offset, encoded_carry); + offset += B40C_RADIXSORT_THREADS * 1; + loads--; + } + + return retval; +} + + +/****************************************************************************** + * Reduction/counting Kernel Entry Point + ******************************************************************************/ + +template +__launch_bounds__ (B40C_RADIXSORT_THREADS, B40C_RADIXSORT_REDUCE_CTA_OCCUPANCY(__CUDA_ARCH__)) +__global__ +void RakingReduction( + bool *d_from_alt_storage, + int *d_spine, + K *d_in_keys, + K *d_out_keys, + CtaDecomposition work_decomposition) +{ + const int RADIX_DIGITS = 1 << RADIX_BITS; + + const int LOG_SCAN_LANES = (RADIX_BITS >= 2) ? RADIX_BITS - 2 : 0; // Always at least one fours group + const int SCAN_LANES = 1 << LOG_SCAN_LANES; + + const int LOG_LANES_PER_WARP = (SCAN_LANES > B40C_RADIXSORT_WARPS) ? LOG_SCAN_LANES - B40C_RADIXSORT_LOG_WARPS : 0; // Always at least one fours group per warp + const int LANES_PER_WARP = 1 << LOG_LANES_PER_WARP; + + + // Each thread gets its own column of fours-groups (for conflict-free updates) + __shared__ int encoded_carry[SCAN_LANES][B40C_RADIXSORT_THREADS]; + + // Each thread is also responsible for aggregating an unencoded segment of a fours-group + int local_counts[LANES_PER_WARP][4]; + + // Determine where to read our input + bool from_alt_storage = (PASS == 0) ? false : d_from_alt_storage[PASS & 0x1]; + if (from_alt_storage) d_in_keys = d_out_keys; + + // Calculate our threadblock's range + int offset, block_elements; + if (blockIdx.x < work_decomposition.num_big_blocks) { + offset = work_decomposition.big_block_elements * blockIdx.x; + block_elements = work_decomposition.big_block_elements; + } else { + offset = (work_decomposition.normal_block_elements * blockIdx.x) + (work_decomposition.num_big_blocks * B40C_RADIXSORT_CYCLE_ELEMENTS(__CUDA_ARCH__, K, V)); + block_elements = work_decomposition.normal_block_elements; + } + + // Initialize local counts + #pragma unroll + for (int LANE = 0; LANE < (int) LANES_PER_WARP; LANE++) { + local_counts[LANE][0] = 0; + local_counts[LANE][1] = 0; + local_counts[LANE][2] = 0; + local_counts[LANE][3] = 0; + } + + // Reset encoded counters + ResetEncodedCarry(encoded_carry); + + // Process loads + int loads = block_elements >> B40C_RADIXSORT_LOG_THREADS; + int unreduced_loads = ProcessLoads( + d_in_keys, + loads, + offset, + encoded_carry, + local_counts); + + // Cleanup if we're the last block + if ((blockIdx.x == gridDim.x - 1) && (work_decomposition.extra_elements_last_block)) { + + const int LOADS_PER_CYCLE = B40C_RADIXSORT_CYCLE_ELEMENTS(__CUDA_ARCH__, K, V) / B40C_RADIXSORT_THREADS; + + // If extra guarded loads may cause overflow, reduce now and reset counters + if (unreduced_loads + LOADS_PER_CYCLE > 255) { + + ReduceEncodedCounts( + local_counts, + encoded_carry); + + ResetEncodedCarry(encoded_carry); + } + + // perform up to LOADS_PER_CYCLE extra guarded loads + #pragma unroll + for (int EXTRA_LOAD = 0; EXTRA_LOAD < (int) LOADS_PER_CYCLE; EXTRA_LOAD++) { + if (threadIdx.x + (B40C_RADIXSORT_THREADS * EXTRA_LOAD) < work_decomposition.extra_elements_last_block) { + K key = d_in_keys[offset + (B40C_RADIXSORT_THREADS * EXTRA_LOAD) + threadIdx.x]; + Bucket(key, encoded_carry); + } + } + } + + // Aggregate + ReduceEncodedCounts( + local_counts, + 
encoded_carry); + + // Write carry in parallel (carries per row are in the first four bytes of each row) + if (threadIdx.x < RADIX_DIGITS) { + + int row = threadIdx.x >> 2; + int col = threadIdx.x & 3; + d_spine[(gridDim.x * threadIdx.x) + blockIdx.x] = encoded_carry[row][col]; + } +} + +} // end namespace b40c_thrust +} // end namespace detail +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cuda/detail/detail/b40c/radixsort_scanscatter_kernel.h b/compat/thrust/system/cuda/detail/detail/b40c/radixsort_scanscatter_kernel.h new file mode 100644 index 0000000..1377999 --- /dev/null +++ b/compat/thrust/system/cuda/detail/detail/b40c/radixsort_scanscatter_kernel.h @@ -0,0 +1,1207 @@ +/****************************************************************************** + * + * Copyright 2010 Duane Merrill + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * + * + * + * AUTHORS' REQUEST: + * + * If you use|reference|benchmark this code, please cite our Technical + * Report (http://www.cs.virginia.edu/~dgm4d/papers/RadixSortTR.pdf): + * + * @TechReport{ Merrill:Sorting:2010, + * author = "Duane Merrill and Andrew Grimshaw", + * title = "Revisiting Sorting for GPGPU Stream Architectures", + * year = "2010", + * institution = "University of Virginia, Department of Computer Science", + * address = "Charlottesville, VA, USA", + * number = "CS2010-03" + * } + * + * For more information, see our Google Code project site: + * http://code.google.com/p/back40computing/ + * + * Thanks! 
+ * + ******************************************************************************/ + + +/****************************************************************************** +// Bottom-level digit scanning/scattering kernel + ******************************************************************************/ + +#pragma once + +#include "radixsort_kernel_common.h" + +namespace thrust { +namespace system { +namespace cuda { +namespace detail { +namespace detail { +namespace b40c_thrust { + +/****************************************************************************** + * Appropriate substitutes to use for out-of-bounds key (and value) offsets + ******************************************************************************/ + +template +__device__ __forceinline__ T DefaultextraValue() { + return T(); +} + +template <> +__device__ __forceinline__ unsigned char DefaultextraValue() { + return (unsigned char) -1; +} + +template <> +__device__ __forceinline__ unsigned short DefaultextraValue() { + return (unsigned short) -1; +} + +template <> +__device__ __forceinline__ unsigned int DefaultextraValue() { + return (unsigned int) -1u; +} + +template <> +__device__ __forceinline__ unsigned long DefaultextraValue() { + return (unsigned long) -1ul; +} + +template <> +__device__ __forceinline__ unsigned long long DefaultextraValue() { + return (unsigned long long) -1ull; +} + + +/****************************************************************************** + * Cycle-processing Routines + ******************************************************************************/ + +template +__device__ __forceinline__ int DecodeDigit(K key) +{ + const K DIGIT_MASK = RADIX_DIGITS - 1; + return (key >> BIT) & DIGIT_MASK; +} + + +template +__device__ __forceinline__ void DecodeDigit( + K key, + int &digit, + int &flag_offset, // in bytes + const int SET_OFFSET) +{ + const int PADDED_BYTES_PER_LANE = PADDED_PARTIALS_PER_LANE * 4; + const int SET_OFFSET_BYTES = SET_OFFSET * 4; + const K QUAD_MASK = (RADIX_DIGITS < 4) ? 0x1 : 0x3; + + digit = DecodeDigit(key); + int lane = digit >> 2; + int quad_byte = digit & QUAD_MASK; + + flag_offset = SET_OFFSET_BYTES + FastMul(lane, PADDED_BYTES_PER_LANE) + quad_byte; +} + + +template +__device__ __forceinline__ void DecodeDigits( + typename VecType::Type keypairs[SETS_PER_PASS], + int2 digits[SETS_PER_PASS], + int2 flag_offsets[SETS_PER_PASS]) // in bytes +{ + + #pragma unroll + for (int SET = 0; SET < (int) SETS_PER_PASS; SET++) { + + const int SET_OFFSET = SET * SCAN_LANES_PER_SET * PADDED_PARTIALS_PER_LANE; + + DecodeDigit( + keypairs[SET].x, digits[SET].x, flag_offsets[SET].x, SET_OFFSET); + + DecodeDigit( + keypairs[SET].y, digits[SET].y, flag_offsets[SET].y, SET_OFFSET); + } +} + + +template +__device__ __forceinline__ void GuardedReadSet( + T *in, + typename VecType::Type &pair, + int offset, + int extra[1], + PreprocessFunctor preprocess = PreprocessFunctor()) +{ + if (offset - extra[0] < 0) { + pair.x = in[offset]; + preprocess(pair.x); + } else { + pair.x = DefaultextraValue(); + } + + if (offset + 1 - extra[0] < 0) { + pair.y = in[offset + 1]; + preprocess(pair.y); + } else { + pair.y = DefaultextraValue(); + } +} + + +template +__device__ __forceinline__ void ReadSets( + typename VecType::Type *d_in, + typename VecType::Type pairs[SETS_PER_PASS], + const int BASE2, + int extra[1], + PreprocessFunctor preprocess = PreprocessFunctor()) +{ + if (UNGUARDED_IO) { + + // N.B. 
-- I wish we could do some pragma unrolling here too, but the compiler makes it 1% slower + if (SETS_PER_PASS > 0) pairs[0] = d_in[threadIdx.x + BASE2 + (B40C_RADIXSORT_THREADS * 0)]; + if (SETS_PER_PASS > 1) pairs[1] = d_in[threadIdx.x + BASE2 + (B40C_RADIXSORT_THREADS * 1)]; + if (SETS_PER_PASS > 2) pairs[2] = d_in[threadIdx.x + BASE2 + (B40C_RADIXSORT_THREADS * 2)]; + if (SETS_PER_PASS > 3) pairs[3] = d_in[threadIdx.x + BASE2 + (B40C_RADIXSORT_THREADS * 3)]; + + #pragma unroll + for (int SET = 0; SET < (int) SETS_PER_PASS; SET++) { + preprocess(pairs[SET].x); + preprocess(pairs[SET].y); + } + + } else { + + T* in = (T*) d_in; + + // N.B. -- I wish we could do some pragma unrolling here, but the compiler won't let + // us with user-defined value types (e.g., Fribbitz): "Advisory: Loop was not unrolled, cannot deduce loop trip count" + + if (SETS_PER_PASS > 0) GuardedReadSet(in, pairs[0], (threadIdx.x << 1) + (BASE2 << 1) + (B40C_RADIXSORT_THREADS * 2 * 0), extra); + if (SETS_PER_PASS > 1) GuardedReadSet(in, pairs[1], (threadIdx.x << 1) + (BASE2 << 1) + (B40C_RADIXSORT_THREADS * 2 * 1), extra); + if (SETS_PER_PASS > 2) GuardedReadSet(in, pairs[2], (threadIdx.x << 1) + (BASE2 << 1) + (B40C_RADIXSORT_THREADS * 2 * 2), extra); + if (SETS_PER_PASS > 3) GuardedReadSet(in, pairs[3], (threadIdx.x << 1) + (BASE2 << 1) + (B40C_RADIXSORT_THREADS * 2 * 3), extra); + } +} + + +template +__device__ __forceinline__ void PlacePartials( + unsigned char * base_partial, + int2 digits[SETS_PER_PASS], + int2 flag_offsets[SETS_PER_PASS]) +{ + #pragma unroll + for (int SET = 0; SET < (int) SETS_PER_PASS; SET++) { + base_partial[flag_offsets[SET].x] = 1; + base_partial[flag_offsets[SET].y] = 1 + (digits[SET].x == digits[SET].y); + } +} + + +template +__device__ __forceinline__ void ExtractRanks( + unsigned char * base_partial, + int2 digits[SETS_PER_PASS], + int2 flag_offsets[SETS_PER_PASS], + int2 ranks[SETS_PER_PASS]) +{ + #pragma unroll + for (int SET = 0; SET < (int) SETS_PER_PASS; SET++) { + ranks[SET].x = base_partial[flag_offsets[SET].x]; + ranks[SET].y = base_partial[flag_offsets[SET].y] + (digits[SET].x == digits[SET].y); + } +} + + +template +__device__ __forceinline__ void UpdateRanks( + int2 digits[SETS_PER_PASS], + int2 ranks[SETS_PER_PASS], + int digit_counts[SETS_PER_PASS][RADIX_DIGITS]) +{ + // N.B.: I wish we could pragma unroll here, but doing so currently + // results in the 3.1 compilier on 64-bit platforms generating bad + // code for SM1.3, resulting in incorrect sorting (e.g., problem size 16) + + if (SETS_PER_PASS > 0) { + ranks[0].x += digit_counts[0][digits[0].x]; + ranks[0].y += digit_counts[0][digits[0].y]; + } + if (SETS_PER_PASS > 1) { + ranks[1].x += digit_counts[1][digits[1].x]; + ranks[1].y += digit_counts[1][digits[1].y]; + } + if (SETS_PER_PASS > 2) { + ranks[2].x += digit_counts[2][digits[2].x]; + ranks[2].y += digit_counts[2][digits[2].y]; + } + if (SETS_PER_PASS > 3) { + ranks[3].x += digit_counts[3][digits[3].x]; + ranks[3].y += digit_counts[3][digits[3].y]; + } +} + +template +__device__ __forceinline__ void UpdateRanks( + int2 digits[PASSES_PER_CYCLE][SETS_PER_PASS], + int2 ranks[PASSES_PER_CYCLE][SETS_PER_PASS], + int digit_counts[PASSES_PER_CYCLE][SETS_PER_PASS][RADIX_DIGITS]) +{ + // N.B.: I wish we could pragma unroll here, but doing so currently + // results in the 3.1 compilier on 64-bit platforms generating bad + // code for SM1.3, resulting in incorrect sorting (e.g., problem size 16) + + if (PASSES_PER_CYCLE > 0) UpdateRanks(digits[0], ranks[0], 
digit_counts[0]); + if (PASSES_PER_CYCLE > 1) UpdateRanks(digits[1], ranks[1], digit_counts[1]); + if (PASSES_PER_CYCLE > 2) UpdateRanks(digits[2], ranks[2], digit_counts[2]); + if (PASSES_PER_CYCLE > 3) UpdateRanks(digits[3], ranks[3], digit_counts[3]); +} + + + +template +__device__ __forceinline__ void PrefixScanOverLanes( + int raking_segment[], + int warpscan[SCAN_LANES_PER_PASS][3][RAKING_THREADS_PER_LANE], + int copy_section) +{ + // Upsweep rake + int partial_reduction = SerialReduce(raking_segment); + + // Warpscan reduction in digit warpscan_lane + int warpscan_lane = threadIdx.x >> LOG_RAKING_THREADS_PER_LANE; + int group_prefix = WarpScan( + warpscan[warpscan_lane], + partial_reduction, + copy_section); + + // Downsweep rake + SerialScan(raking_segment, group_prefix); + +} + + +template +__device__ __forceinline__ void RecoverDigitCounts( + int warpscan[SCAN_LANES_PER_PASS][3][RAKING_THREADS_PER_LANE], + int counts[SETS_PER_PASS], + int copy_section) +{ + int my_lane = threadIdx.x >> 2; + int my_quad_byte = threadIdx.x & 3; + + #pragma unroll + for (int SET = 0; SET < (int) SETS_PER_PASS; SET++) { + unsigned char *warpscan_count = (unsigned char *) &warpscan[my_lane + (SCAN_LANES_PER_SET * SET)][1 + copy_section][RAKING_THREADS_PER_LANE - 1]; + counts[SET] = warpscan_count[my_quad_byte]; + } +} + +template +__device__ __forceinline__ void CorrectUnguardedSetOverflow( + int2 set_digits, + int &set_count) +{ + if (WarpVoteAll(RADIX_DIGITS, set_count <= 1)) { + // All first-pass, first set keys have same digit. + set_count = (threadIdx.x == set_digits.x) ? 256 : 0; + } +} + +template +__device__ __forceinline__ void CorrectUnguardedPassOverflow( + int2 pass_digits[SETS_PER_PASS], + int pass_counts[SETS_PER_PASS]) +{ + // N.B. -- I wish we could do some pragma unrolling here too, but the compiler won't comply, + // telling me "Advisory: Loop was not unrolled, unexpected call OPs" + + if (SETS_PER_PASS > 0) CorrectUnguardedSetOverflow(pass_digits[0], pass_counts[0]); + if (SETS_PER_PASS > 1) CorrectUnguardedSetOverflow(pass_digits[1], pass_counts[1]); + if (SETS_PER_PASS > 2) CorrectUnguardedSetOverflow(pass_digits[2], pass_counts[2]); + if (SETS_PER_PASS > 3) CorrectUnguardedSetOverflow(pass_digits[3], pass_counts[3]); +} + + +template +__device__ __forceinline__ void CorrectUnguardedCycleOverflow( + int2 cycle_digits[PASSES_PER_CYCLE][SETS_PER_PASS], + int cycle_counts[PASSES_PER_CYCLE][SETS_PER_PASS]) +{ + // N.B. 
-- I wish we could do some pragma unrolling here too, but the compiler won't comply, + // telling me "Advisory: Loop was not unrolled, unexpected call OPs" + + if (PASSES_PER_CYCLE > 0) CorrectUnguardedPassOverflow(cycle_digits[0], cycle_counts[0]); + if (PASSES_PER_CYCLE > 1) CorrectUnguardedPassOverflow(cycle_digits[1], cycle_counts[1]); +} + + +template +__device__ __forceinline__ void CorrectLastLaneOverflow(int &count, int extra[1]) +{ + if (WarpVoteAll(RADIX_DIGITS, count == 0) && (threadIdx.x == RADIX_DIGITS - 1)) { + // We're 'f' and we overflowed b/c of invalid 'f' placemarkers; the number of valid items in this set is the count of valid f's + count = extra[0] & 255; + } +} + + +template +__device__ __forceinline__ void CorrectForOverflows( + int2 digits[PASSES_PER_CYCLE][SETS_PER_PASS], + int counts[PASSES_PER_CYCLE][SETS_PER_PASS], + int extra[1]) +{ + if (!UNGUARDED_IO) { + + // Correct any overflow in the partially-filled last lane + int *linear_counts = (int *) counts; + CorrectLastLaneOverflow(linear_counts[SETS_PER_CYCLE - 1], extra); + } + + CorrectUnguardedCycleOverflow(digits, counts); +} + + +template < + typename K, + int BIT, + int RADIX_DIGITS, + int SCAN_LANES_PER_SET, + int SETS_PER_PASS, + int RAKING_THREADS_PER_PASS, + int SCAN_LANES_PER_PASS, + int LOG_RAKING_THREADS_PER_LANE, + int RAKING_THREADS_PER_LANE, + int PARTIALS_PER_SEG, + int PADDED_PARTIALS_PER_LANE, + int PASSES_PER_CYCLE> +__device__ __forceinline__ void ScanPass( + int *base_partial, + int *raking_partial, + int warpscan[SCAN_LANES_PER_PASS][3][RAKING_THREADS_PER_LANE], + typename VecType::Type keypairs[SETS_PER_PASS], + int2 digits[SETS_PER_PASS], + int2 flag_offsets[SETS_PER_PASS], + int2 ranks[SETS_PER_PASS], + int copy_section) +{ + // Reset smem + #pragma unroll + for (int SCAN_LANE = 0; SCAN_LANE < (int) SCAN_LANES_PER_PASS; SCAN_LANE++) { + base_partial[SCAN_LANE * PADDED_PARTIALS_PER_LANE] = 0; + } + + // Decode digits for first pass + DecodeDigits( + keypairs, digits, flag_offsets); + + // Encode counts into smem for first pass + PlacePartials( + (unsigned char *) base_partial, + digits, + flag_offsets); + + __syncthreads(); + + // Intra-group prefix scans for first pass + if (threadIdx.x < RAKING_THREADS_PER_PASS) { + + PrefixScanOverLanes( // first pass is offset right by one + raking_partial, + warpscan, + copy_section); + } + + __syncthreads(); + + // Extract ranks + ExtractRanks( + (unsigned char *) base_partial, + digits, + flag_offsets, + ranks); +} + + +/****************************************************************************** + * SM1.3 Local Exchange Routines + * + * Routines for exchanging keys (and values) in shared memory (i.e., local + * scattering) in order to to facilitate coalesced global scattering + ******************************************************************************/ + +template +__device__ __forceinline__ void ScatterSets( + T *d_out, + typename VecType::Type pairs[SETS_PER_PASS], + int2 offsets[SETS_PER_PASS], + const int BASE4, + int extra[1], + PostprocessFunctor postprocess = PostprocessFunctor()) +{ + #pragma unroll + for (int SET = 0; SET < (int) SETS_PER_PASS; SET++) { + postprocess(pairs[SET].x); + postprocess(pairs[SET].y); + } + + // N.B. 
-- I wish we could do some pragma unrolling here too, but the compiler makes it 1% slower + + if (SETS_PER_PASS > 0) { + if (UNGUARDED_IO || (threadIdx.x + BASE4 + (B40C_RADIXSORT_THREADS * 0) < extra[0])) + d_out[offsets[0].x] = pairs[0].x; + if (UNGUARDED_IO || (threadIdx.x + BASE4 + (B40C_RADIXSORT_THREADS * 1) < extra[0])) + d_out[offsets[0].y] = pairs[0].y; + } + + if (SETS_PER_PASS > 1) { + if (UNGUARDED_IO || (threadIdx.x + BASE4 + (B40C_RADIXSORT_THREADS * 2) < extra[0])) + d_out[offsets[1].x] = pairs[1].x; + if (UNGUARDED_IO || (threadIdx.x + BASE4 + (B40C_RADIXSORT_THREADS * 3) < extra[0])) + d_out[offsets[1].y] = pairs[1].y; + } + + if (SETS_PER_PASS > 2) { + if (UNGUARDED_IO || (threadIdx.x + BASE4 + (B40C_RADIXSORT_THREADS * 4) < extra[0])) + d_out[offsets[2].x] = pairs[2].x; + if (UNGUARDED_IO || (threadIdx.x + BASE4 + (B40C_RADIXSORT_THREADS * 5) < extra[0])) + d_out[offsets[2].y] = pairs[2].y; + } + + if (SETS_PER_PASS > 3) { + if (UNGUARDED_IO || (threadIdx.x + BASE4 + (B40C_RADIXSORT_THREADS * 6) < extra[0])) + d_out[offsets[3].x] = pairs[3].x; + if (UNGUARDED_IO || (threadIdx.x + BASE4 + (B40C_RADIXSORT_THREADS * 7) < extra[0])) + d_out[offsets[3].y] = pairs[3].y; + } +} + +template +__device__ __forceinline__ void PushPairs( + T *swap, + typename VecType::Type pairs[PASSES_PER_CYCLE][SETS_PER_PASS], + int2 ranks[PASSES_PER_CYCLE][SETS_PER_PASS]) +{ + #pragma unroll + for (int PASS = 0; PASS < (int) PASSES_PER_CYCLE; PASS++) { + + #pragma unroll + for (int SET = 0; SET < (int) SETS_PER_PASS; SET++) { + swap[ranks[PASS][SET].x] = pairs[PASS][SET].x; + swap[ranks[PASS][SET].y] = pairs[PASS][SET].y; + } + } +} + +template +__device__ __forceinline__ void ExchangePairs( + T *swap, + typename VecType::Type pairs[PASSES_PER_CYCLE][SETS_PER_PASS], + int2 ranks[PASSES_PER_CYCLE][SETS_PER_PASS]) +{ + // Push in Pairs + PushPairs(swap, pairs, ranks); + + __syncthreads(); + + // Extract pairs + #pragma unroll + for (int PASS = 0; PASS < (int) PASSES_PER_CYCLE; PASS++) { + + #pragma unroll + for (int SET = 0; SET < (int) SETS_PER_PASS; SET++) { + const int BLOCK = ((PASS * SETS_PER_PASS) + SET) * 2; + pairs[PASS][SET].x = swap[threadIdx.x + (B40C_RADIXSORT_THREADS * (BLOCK + 0))]; + pairs[PASS][SET].y = swap[threadIdx.x + (B40C_RADIXSORT_THREADS * (BLOCK + 1))]; + } + } +} + + +template < + typename K, + typename V, + int RADIX_DIGITS, + int BIT, + int PASSES_PER_CYCLE, + int SETS_PER_PASS, + bool UNGUARDED_IO, + typename PostprocessFunctor> +__device__ __forceinline__ void SwapAndScatterSm13( + typename VecType::Type keypairs[PASSES_PER_CYCLE][SETS_PER_PASS], + int2 ranks[PASSES_PER_CYCLE][SETS_PER_PASS], + int4 *exchange, + typename VecType::Type *d_in_values, + K *d_out_keys, + V *d_out_values, + int carry[RADIX_DIGITS], + int extra[1]) +{ + int2 offsets[PASSES_PER_CYCLE][SETS_PER_PASS]; + + // Swap keys according to ranks + ExchangePairs((K*) exchange, keypairs, ranks); + + // Calculate scatter offsets (re-decode digits from keys: it's less work than making a second exchange of digits) + #pragma unroll + for (int PASS = 0; PASS < (int) PASSES_PER_CYCLE; PASS++) { + + #pragma unroll + for (int SET = 0; SET < (int) SETS_PER_PASS; SET++) { + const int BLOCK = ((PASS * SETS_PER_PASS) + SET) * 2; + offsets[PASS][SET].x = threadIdx.x + (B40C_RADIXSORT_THREADS * (BLOCK + 0)) + carry[DecodeDigit(keypairs[PASS][SET].x)]; + offsets[PASS][SET].y = threadIdx.x + (B40C_RADIXSORT_THREADS * (BLOCK + 1)) + carry[DecodeDigit(keypairs[PASS][SET].y)]; + } + } + + // Scatter keys + #pragma unroll + 
for (int PASS = 0; PASS < (int) PASSES_PER_CYCLE; PASS++) { + const int BLOCK = PASS * SETS_PER_PASS * 2; + ScatterSets(d_out_keys, keypairs[PASS], offsets[PASS], B40C_RADIXSORT_THREADS * BLOCK, extra); + } + + if (!IsKeysOnly()) { + + __syncthreads(); + + // Read input data + typename VecType::Type datapairs[PASSES_PER_CYCLE][SETS_PER_PASS]; + + // N.B. -- I wish we could do some pragma unrolling here too, but the compiler won't comply, + // telling me "Advisory: Loop was not unrolled, unexpected control flow" + + if (PASSES_PER_CYCLE > 0) ReadSets >(d_in_values, datapairs[0], B40C_RADIXSORT_THREADS * SETS_PER_PASS * 0, extra); + if (PASSES_PER_CYCLE > 1) ReadSets >(d_in_values, datapairs[1], B40C_RADIXSORT_THREADS * SETS_PER_PASS * 1, extra); + + // Swap data according to ranks + ExchangePairs((V*) exchange, datapairs, ranks); + + // Scatter data + #pragma unroll + for (int PASS = 0; PASS < (int) PASSES_PER_CYCLE; PASS++) { + const int BLOCK = PASS * SETS_PER_PASS * 2; + ScatterSets >(d_out_values, datapairs[PASS], offsets[PASS], B40C_RADIXSORT_THREADS * BLOCK, extra); + } + } +} + + +/****************************************************************************** + * SM1.0 Local Exchange Routines + * + * Routines for exchanging keys (and values) in shared memory (i.e., local + * scattering) in order to facilitate coalesced global scattering + ******************************************************************************/ + +template < + typename T, + int RADIX_DIGITS, + bool UNGUARDED_IO, + typename PostprocessFunctor> +__device__ __forceinline__ void ScatterPass( + T *swapmem, + T *d_out, + int digit_scan[2][RADIX_DIGITS], + int carry[RADIX_DIGITS], + int extra[1], + int base_digit, + PostprocessFunctor postprocess = PostprocessFunctor()) +{ + const int LOG_STORE_TXN_THREADS = B40C_LOG_MEM_BANKS(__CUDA_ARCH__); + const int STORE_TXN_THREADS = 1 << LOG_STORE_TXN_THREADS; + + int store_txn_idx = threadIdx.x & (STORE_TXN_THREADS - 1); + int store_txn_digit = threadIdx.x >> LOG_STORE_TXN_THREADS; + + int my_digit = base_digit + store_txn_digit; + if (my_digit < RADIX_DIGITS) { + + int my_exclusive_scan = digit_scan[1][my_digit - 1]; + int my_inclusive_scan = digit_scan[1][my_digit]; + int my_digit_count = my_inclusive_scan - my_exclusive_scan; + + int my_carry = carry[my_digit] + my_exclusive_scan; + int my_aligned_offset = store_txn_idx - (my_carry & (STORE_TXN_THREADS - 1)); + + while (my_aligned_offset < my_digit_count) { + + if ((my_aligned_offset >= 0) && (UNGUARDED_IO || (my_exclusive_scan + my_aligned_offset < extra[0]))) { + + T datum = swapmem[my_exclusive_scan + my_aligned_offset]; + postprocess(datum); + d_out[my_carry + my_aligned_offset] = datum; + } + my_aligned_offset += STORE_TXN_THREADS; + } + } +} + +template < + typename T, + int RADIX_DIGITS, + int PASSES_PER_CYCLE, + int SETS_PER_PASS, + bool UNGUARDED_IO, + typename PostprocessFunctor> +__device__ __forceinline__ void SwapAndScatterPairs( + typename VecType::Type pairs[PASSES_PER_CYCLE][SETS_PER_PASS], + int2 ranks[PASSES_PER_CYCLE][SETS_PER_PASS], + T *exchange, + T *d_out, + int carry[RADIX_DIGITS], + int digit_scan[2][RADIX_DIGITS], + int extra[1]) +{ + const int SCATTER_PASS_DIGITS = B40C_RADIXSORT_WARPS * (B40C_WARP_THREADS / B40C_MEM_BANKS(__CUDA_ARCH__)); + const int SCATTER_PASSES = RADIX_DIGITS / SCATTER_PASS_DIGITS; + + // Push in pairs + PushPairs(exchange, pairs, ranks); + + __syncthreads(); + + // N.B. 
-- I wish we could do some pragma unrolling here too, but the compiler won't comply, + // telling me "Advisory: Loop was not unrolled, not an innermost loop" + + if (SCATTER_PASSES > 0) ScatterPass(exchange, d_out, digit_scan, carry, extra, SCATTER_PASS_DIGITS * 0); + if (SCATTER_PASSES > 1) ScatterPass(exchange, d_out, digit_scan, carry, extra, SCATTER_PASS_DIGITS * 1); + if (SCATTER_PASSES > 2) ScatterPass(exchange, d_out, digit_scan, carry, extra, SCATTER_PASS_DIGITS * 2); + if (SCATTER_PASSES > 3) ScatterPass(exchange, d_out, digit_scan, carry, extra, SCATTER_PASS_DIGITS * 3); + if (SCATTER_PASSES > 4) ScatterPass(exchange, d_out, digit_scan, carry, extra, SCATTER_PASS_DIGITS * 4); + if (SCATTER_PASSES > 5) ScatterPass(exchange, d_out, digit_scan, carry, extra, SCATTER_PASS_DIGITS * 5); + if (SCATTER_PASSES > 6) ScatterPass(exchange, d_out, digit_scan, carry, extra, SCATTER_PASS_DIGITS * 6); + if (SCATTER_PASSES > 7) ScatterPass(exchange, d_out, digit_scan, carry, extra, SCATTER_PASS_DIGITS * 7); +} + + +template < + typename K, + typename V, + int RADIX_DIGITS, + int PASSES_PER_CYCLE, + int SETS_PER_PASS, + bool UNGUARDED_IO, + typename PostprocessFunctor> +__device__ __forceinline__ void SwapAndScatterSm10( + typename VecType::Type keypairs[PASSES_PER_CYCLE][SETS_PER_PASS], + int2 ranks[PASSES_PER_CYCLE][SETS_PER_PASS], + int4 *exchange, + typename VecType::Type *d_in_values, + K *d_out_keys, + V *d_out_values, + int carry[RADIX_DIGITS], + int digit_scan[2][RADIX_DIGITS], + int extra[1]) +{ + // Swap and scatter keys + SwapAndScatterPairs( + keypairs, ranks, (K*) exchange, d_out_keys, carry, digit_scan, extra); + + if (!IsKeysOnly()) { + + __syncthreads(); + + // N.B. -- I wish we could do some pragma unrolling here too, but the compiler won't comply, + // telling me "Advisory: Loop was not unrolled, unexpected control flow" + + // Read input data + typename VecType::Type datapairs[PASSES_PER_CYCLE][SETS_PER_PASS]; + if (PASSES_PER_CYCLE > 0) ReadSets >(d_in_values, datapairs[0], B40C_RADIXSORT_THREADS * SETS_PER_PASS * 0, extra); + if (PASSES_PER_CYCLE > 1) ReadSets >(d_in_values, datapairs[1], B40C_RADIXSORT_THREADS * SETS_PER_PASS * 1, extra); + + // Swap and scatter data + SwapAndScatterPairs >( + datapairs, ranks, (V*) exchange, d_out_values, carry, digit_scan, extra); + } +} + + +/****************************************************************************** + * Cycle of RADIXSORT_CYCLE_ELEMENTS keys (and values) + ******************************************************************************/ + +template < + typename K, + typename V, + int BIT, + bool UNGUARDED_IO, + int RADIX_DIGITS, + int LOG_SCAN_LANES_PER_SET, + int SCAN_LANES_PER_SET, + int SETS_PER_PASS, + int PASSES_PER_CYCLE, + int LOG_SCAN_LANES_PER_PASS, + int SCAN_LANES_PER_PASS, + int LOG_PARTIALS_PER_LANE, + int LOG_PARTIALS_PER_PASS, + int LOG_RAKING_THREADS_PER_PASS, + int RAKING_THREADS_PER_PASS, + int LOG_RAKING_THREADS_PER_LANE, + int RAKING_THREADS_PER_LANE, + int LOG_PARTIALS_PER_SEG, + int PARTIALS_PER_SEG, + int LOG_PARTIALS_PER_ROW, + int PARTIALS_PER_ROW, + int LOG_SEGS_PER_ROW, + int SEGS_PER_ROW, + int LOG_ROWS_PER_SET, + int LOG_ROWS_PER_LANE, + int ROWS_PER_LANE, + int LOG_ROWS_PER_PASS, + int ROWS_PER_PASS, + int MAX_EXCHANGE_BYTES, + typename PreprocessFunctor, + typename PostprocessFunctor> + +__device__ __forceinline__ void SrtsScanDigitCycle( + typename VecType::Type *d_in_keys, + typename VecType::Type *d_in_values, + K *d_out_keys, + V *d_out_values, + int4 *exchange, + int 
warpscan[SCAN_LANES_PER_PASS][3][RAKING_THREADS_PER_LANE], + int carry[RADIX_DIGITS], + int digit_scan[2][RADIX_DIGITS], + int digit_counts[PASSES_PER_CYCLE][SETS_PER_PASS][RADIX_DIGITS], + int extra[1], + int *base_partial, + int *raking_partial) +{ + + const int PADDED_PARTIALS_PER_LANE = ROWS_PER_LANE * (PARTIALS_PER_ROW + 1); + const int SETS_PER_CYCLE = PASSES_PER_CYCLE * SETS_PER_PASS; + + // N.B.: We use the following voodoo incantations to elide the compiler's miserable + // "declared but never referenced" warnings for these (which are actually used for + // template instantiation) + SuppressUnusedConstantWarning(PADDED_PARTIALS_PER_LANE); + SuppressUnusedConstantWarning(SETS_PER_CYCLE); + + typename VecType::Type keypairs[PASSES_PER_CYCLE][SETS_PER_PASS]; + int2 digits[PASSES_PER_CYCLE][SETS_PER_PASS]; + int2 flag_offsets[PASSES_PER_CYCLE][SETS_PER_PASS]; // a byte offset + int2 ranks[PASSES_PER_CYCLE][SETS_PER_PASS]; + + + //------------------------------------------------------------------------- + // Read keys + //------------------------------------------------------------------------- + + // N.B. -- I wish we could do some pragma unrolling here too, but the compiler won't comply, + // telling me "Advisory: Loop was not unrolled, unexpected control flow construct" + + // Read Keys + if (PASSES_PER_CYCLE > 0) ReadSets(d_in_keys, keypairs[0], B40C_RADIXSORT_THREADS * SETS_PER_PASS * 0, extra); + if (PASSES_PER_CYCLE > 1) ReadSets(d_in_keys, keypairs[1], B40C_RADIXSORT_THREADS * SETS_PER_PASS * 1, extra); + + //------------------------------------------------------------------------- + // Lane-scanning Passes + //------------------------------------------------------------------------- + + #pragma unroll + for (int PASS = 0; PASS < (int) PASSES_PER_CYCLE; PASS++) { + + // First Pass + ScanPass( + base_partial, + raking_partial, + warpscan, + keypairs[PASS], + digits[PASS], + flag_offsets[PASS], + ranks[PASS], + PASSES_PER_CYCLE - PASS - 1); // lower passes get copied right + } + + //------------------------------------------------------------------------- + // Digit-scanning + //------------------------------------------------------------------------- + + // Recover second-half digit-counts, scan across all digit-counts + if (threadIdx.x < RADIX_DIGITS) { + + int counts[PASSES_PER_CYCLE][SETS_PER_PASS]; + + // Recover digit-counts + + #pragma unroll + for (int PASS = 0; PASS < (int) PASSES_PER_CYCLE; PASS++) { + RecoverDigitCounts( // first pass, offset by 1 + warpscan, + counts[PASS], + PASSES_PER_CYCLE - PASS - 1); // lower passes get copied right + } + + // Check for overflows + CorrectForOverflows( + digits, counts, extra); + + // Scan across my digit counts for each set + int exclusive_total = 0; + int inclusive_total = 0; + + #pragma unroll + for (int PASS = 0; PASS < (int) PASSES_PER_CYCLE; PASS++) { + + #pragma unroll + for (int SET = 0; SET < (int) SETS_PER_PASS; SET++) { + inclusive_total += counts[PASS][SET]; + counts[PASS][SET] = exclusive_total; + exclusive_total = inclusive_total; + } + } + + // second half of carry update + int my_carry = carry[threadIdx.x] + digit_scan[1][threadIdx.x]; + + // Perform overflow-free SIMD Kogge-Stone across digits + int digit_prefix = WarpScan( + digit_scan, + inclusive_total, + 0); + + // first-half of carry update + carry[threadIdx.x] = my_carry - digit_prefix; + + #pragma unroll + for (int PASS = 0; PASS < (int) PASSES_PER_CYCLE; PASS++) { + + #pragma unroll + for (int SET = 0; SET < (int) SETS_PER_PASS; SET++) { + 
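+ // For this digit (threadIdx.x), record the pass/set's base offset within the tile: the digit's exclusive running count over the cycle (counts) plus its exclusive prefix across all digits (digit_prefix); UpdateRanks rebases the per-key ranks against these after the barrier below.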
digit_counts[PASS][SET][threadIdx.x] = counts[PASS][SET] + digit_prefix; + } + } + } + + __syncthreads(); + + //------------------------------------------------------------------------- + // Update Ranks + //------------------------------------------------------------------------- + + UpdateRanks(digits, ranks, digit_counts); + + + //------------------------------------------------------------------------- + // Scatter + //------------------------------------------------------------------------- + +#if ((__CUDA_ARCH__ < 130) || FERMI_ECC) + + SwapAndScatterSm10( + keypairs, + ranks, + exchange, + d_in_values, + d_out_keys, + d_out_values, + carry, + digit_scan, + extra); + +#else + + SwapAndScatterSm13( + keypairs, + ranks, + exchange, + d_in_values, + d_out_keys, + d_out_values, + carry, + extra); + +#endif + + __syncthreads(); + +} + + + +/****************************************************************************** + * Scan/Scatter Kernel Entry Point + ******************************************************************************/ + +template < + typename K, + typename V, + int PASS, + int RADIX_BITS, + int BIT, + typename PreprocessFunctor, + typename PostprocessFunctor> +__launch_bounds__ (B40C_RADIXSORT_THREADS, B40C_RADIXSORT_SCAN_SCATTER_CTA_OCCUPANCY(__CUDA_ARCH__)) +__global__ +void ScanScatterDigits( + bool *d_from_alt_storage, + int* d_spine, + K* d_in_keys, + K* d_out_keys, + V* d_in_values, + V* d_out_values, + CtaDecomposition work_decomposition) +{ + + const int RADIX_DIGITS = 1 << RADIX_BITS; + + const int LOG_SCAN_LANES_PER_SET = (RADIX_BITS > 2) ? RADIX_BITS - 2 : 0; // Always at one lane per set + const int SCAN_LANES_PER_SET = 1 << LOG_SCAN_LANES_PER_SET; // N.B.: we have "declared but never referenced" warnings for these, but they're actually used for template instantiation + + const int LOG_SETS_PER_PASS = B40C_RADIXSORT_LOG_SETS_PER_PASS(__CUDA_ARCH__); + const int SETS_PER_PASS = 1 << LOG_SETS_PER_PASS; + + const int LOG_PASSES_PER_CYCLE = B40C_RADIXSORT_LOG_PASSES_PER_CYCLE(__CUDA_ARCH__, K, V); + const int PASSES_PER_CYCLE = 1 << LOG_PASSES_PER_CYCLE; + + const int LOG_SCAN_LANES_PER_PASS = LOG_SETS_PER_PASS + LOG_SCAN_LANES_PER_SET; + const int SCAN_LANES_PER_PASS = 1 << LOG_SCAN_LANES_PER_PASS; + + const int LOG_PARTIALS_PER_LANE = B40C_RADIXSORT_LOG_THREADS; + + const int LOG_PARTIALS_PER_PASS = LOG_SCAN_LANES_PER_PASS + LOG_PARTIALS_PER_LANE; + + const int LOG_RAKING_THREADS_PER_PASS = B40C_RADIXSORT_LOG_RAKING_THREADS_PER_PASS(__CUDA_ARCH__); + const int RAKING_THREADS_PER_PASS = 1 << LOG_RAKING_THREADS_PER_PASS; + + const int LOG_RAKING_THREADS_PER_LANE = LOG_RAKING_THREADS_PER_PASS - LOG_SCAN_LANES_PER_PASS; + const int RAKING_THREADS_PER_LANE = 1 << LOG_RAKING_THREADS_PER_LANE; + + const int LOG_PARTIALS_PER_SEG = LOG_PARTIALS_PER_LANE - LOG_RAKING_THREADS_PER_LANE; + const int PARTIALS_PER_SEG = 1 << LOG_PARTIALS_PER_SEG; + + const int LOG_PARTIALS_PER_ROW = (LOG_PARTIALS_PER_SEG < B40C_LOG_MEM_BANKS(__CUDA_ARCH__)) ? 
B40C_LOG_MEM_BANKS(__CUDA_ARCH__) : LOG_PARTIALS_PER_SEG; // floor of MEM_BANKS partials per row + const int PARTIALS_PER_ROW = 1 << LOG_PARTIALS_PER_ROW; + const int PADDED_PARTIALS_PER_ROW = PARTIALS_PER_ROW + 1; + + const int LOG_SEGS_PER_ROW = LOG_PARTIALS_PER_ROW - LOG_PARTIALS_PER_SEG; + const int SEGS_PER_ROW = 1 << LOG_SEGS_PER_ROW; + + const int LOG_ROWS_PER_SET = LOG_PARTIALS_PER_PASS - LOG_PARTIALS_PER_ROW; + + const int LOG_ROWS_PER_LANE = LOG_PARTIALS_PER_LANE - LOG_PARTIALS_PER_ROW; + const int ROWS_PER_LANE = 1 << LOG_ROWS_PER_LANE; + + const int LOG_ROWS_PER_PASS = LOG_SCAN_LANES_PER_PASS + LOG_ROWS_PER_LANE; + const int ROWS_PER_PASS = 1 << LOG_ROWS_PER_PASS; + + const int SCAN_LANE_BYTES = ROWS_PER_PASS * PADDED_PARTIALS_PER_ROW * sizeof(int); + const int MAX_EXCHANGE_BYTES = (sizeof(K) > sizeof(V)) ? + B40C_RADIXSORT_CYCLE_ELEMENTS(__CUDA_ARCH__, K, V) * sizeof(K) : + B40C_RADIXSORT_CYCLE_ELEMENTS(__CUDA_ARCH__, K, V) * sizeof(V); + const int SCAN_LANE_INT4S = (B40C_MAX(MAX_EXCHANGE_BYTES, SCAN_LANE_BYTES) + sizeof(int4) - 1) / sizeof(int4); + + + // N.B.: We use the following voodoo incantations to elide the compiler's miserable + // "declared but never referenced" warnings for these (which are actually used for + // template instantiation) + SuppressUnusedConstantWarning(SCAN_LANES_PER_SET); + SuppressUnusedConstantWarning(PARTIALS_PER_SEG); + SuppressUnusedConstantWarning(LOG_ROWS_PER_SET); + SuppressUnusedConstantWarning(ROWS_PER_LANE); + + // scan_lanes is a int4[] to avoid alignment issues when casting to (K *) and/or (V *) + __shared__ int4 scan_lanes[SCAN_LANE_INT4S]; + __shared__ int warpscan[SCAN_LANES_PER_PASS][3][RAKING_THREADS_PER_LANE]; // One warpscan per fours-group + __shared__ int carry[RADIX_DIGITS]; + __shared__ int digit_scan[2][RADIX_DIGITS]; + __shared__ int digit_counts[PASSES_PER_CYCLE][SETS_PER_PASS][RADIX_DIGITS]; + __shared__ bool non_trivial_digit_pass; + __shared__ bool from_alt_storage; + + _B40C_REG_MISER_QUALIFIER_ int extra[1]; + _B40C_REG_MISER_QUALIFIER_ int oob[1]; + + extra[0] = (blockIdx.x == gridDim.x - 1) ? 
work_decomposition.extra_elements_last_block : 0; + + // calculate our threadblock's range + int block_elements, block_offset; + if (blockIdx.x < work_decomposition.num_big_blocks) { + block_offset = work_decomposition.big_block_elements * blockIdx.x; + block_elements = work_decomposition.big_block_elements; + } else { + block_offset = (work_decomposition.normal_block_elements * blockIdx.x) + (work_decomposition.num_big_blocks * B40C_RADIXSORT_CYCLE_ELEMENTS(__CUDA_ARCH__, K, V)); + block_elements = work_decomposition.normal_block_elements; + } + oob[0] = block_offset + block_elements; // out-of-bounds + + + // location for placing 2-element partial reductions in the first lane of a pass + int row = threadIdx.x >> LOG_PARTIALS_PER_ROW; + int col = threadIdx.x & (PARTIALS_PER_ROW - 1); + int *base_partial = reinterpret_cast(scan_lanes) + (row * PADDED_PARTIALS_PER_ROW) + col; + + // location for raking across all sets within a pass + int *raking_partial = 0; + + if (threadIdx.x < RAKING_THREADS_PER_PASS) { + + // initialize lane warpscans + if (threadIdx.x < RAKING_THREADS_PER_LANE) { + + #pragma unroll + for (int SCAN_LANE = 0; SCAN_LANE < (int) SCAN_LANES_PER_PASS; SCAN_LANE++) { + warpscan[SCAN_LANE][0][threadIdx.x] = 0; + } + } + + // initialize digit warpscans + if (threadIdx.x < RADIX_DIGITS) { + + // Initialize digit_scan + digit_scan[0][threadIdx.x] = 0; + digit_scan[1][threadIdx.x] = 0; + + // Determine where to read our input + from_alt_storage = (PASS == 0) ? false : d_from_alt_storage[PASS & 0x1]; + + // Read carry in parallel + int spine_digit_offset = FastMul(gridDim.x, threadIdx.x); + int my_digit_carry = d_spine[spine_digit_offset + blockIdx.x]; + carry[threadIdx.x] = my_digit_carry; + + // Determine whether or not we have work to do and set up the next round + // accordingly. Everybody but the first threadblock can determine this + // from the number of non-zero-and-non-oob digit carries. First block + // needs someone else's because it always writes the zero offset. 
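+ // (For example: if every key in the input shares the same value in the current digit place and no pre/postprocessing is required, no digit boundary falls strictly inside the sequence, the vote below tallies zero, and the entire scan/scatter pass is skipped with the keys left in their current buffer.)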
+ + int predicate; + if (PreprocessFunctor::MustApply() || PostprocessFunctor::MustApply()) { + + non_trivial_digit_pass = true; + + } else { + + if (blockIdx.x > 0) { + // Non-first CTA : use digit-carry from first block + my_digit_carry = d_spine[spine_digit_offset]; + } + + predicate = ((my_digit_carry > 0) && (my_digit_carry < work_decomposition.num_elements)); + non_trivial_digit_pass = (TallyWarpVote(RADIX_DIGITS, predicate, reinterpret_cast(scan_lanes)) > 0); + } + + // Let the next round know which set of buffers to use + if (blockIdx.x == 0) d_from_alt_storage[(PASS + 1) & 0x1] = from_alt_storage ^ non_trivial_digit_pass; + } + + // initialize raking segment + row = threadIdx.x >> LOG_SEGS_PER_ROW; + col = (threadIdx.x & (SEGS_PER_ROW - 1)) << LOG_PARTIALS_PER_SEG; + raking_partial = reinterpret_cast(scan_lanes) + (row * PADDED_PARTIALS_PER_ROW) + col; + } + + // Sync to acquire non_trivial_digit_pass and from_temp_storage + __syncthreads(); + + // Short-circuit this entire pass + if (!non_trivial_digit_pass) return; + + if (!from_alt_storage) { + + // Scan in tiles of cycle_elements + while (block_offset < oob[0]) { + + SrtsScanDigitCycle( + reinterpret_cast::Type *>((void *) &d_in_keys[block_offset]), + reinterpret_cast::Type *>((void *) &d_in_values[block_offset]), + d_out_keys, + d_out_values, + scan_lanes, + warpscan, + carry, + digit_scan, + digit_counts, + extra, + base_partial, + raking_partial); + + block_offset += B40C_RADIXSORT_CYCLE_ELEMENTS(__CUDA_ARCH__, K, V); + } + + if (extra[0]) { + + SrtsScanDigitCycle( + reinterpret_cast::Type *>((void *) &d_in_keys[block_offset]), + reinterpret_cast::Type *>((void *) &d_in_values[block_offset]), + d_out_keys, + d_out_values, + scan_lanes, + warpscan, + carry, + digit_scan, + digit_counts, + extra, + base_partial, + raking_partial); + } + + } else { + + // Scan in tiles of cycle_elements + while (block_offset < oob[0]) { + + SrtsScanDigitCycle( + reinterpret_cast::Type *>((void *) &d_out_keys[block_offset]), + reinterpret_cast::Type *>((void *) &d_out_values[block_offset]), + d_in_keys, + d_in_values, + scan_lanes, + warpscan, + carry, + digit_scan, + digit_counts, + extra, + base_partial, + raking_partial); + + block_offset += B40C_RADIXSORT_CYCLE_ELEMENTS(__CUDA_ARCH__, K, V); + } + + if (extra[0]) { + + SrtsScanDigitCycle( + reinterpret_cast::Type *>((void *) &d_out_keys[block_offset]), + reinterpret_cast::Type *>((void *) &d_out_values[block_offset]), + d_in_keys, + d_in_values, + scan_lanes, + warpscan, + carry, + digit_scan, + digit_counts, + extra, + base_partial, + raking_partial); + } + + } +} + +} // end namespace b40c_thrust +} // end namespace detail +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cuda/detail/detail/b40c/radixsort_spine_kernel.h b/compat/thrust/system/cuda/detail/detail/b40c/radixsort_spine_kernel.h new file mode 100644 index 0000000..3d20f4a --- /dev/null +++ b/compat/thrust/system/cuda/detail/detail/b40c/radixsort_spine_kernel.h @@ -0,0 +1,187 @@ +/****************************************************************************** + * + * Copyright 2010 Duane Merrill + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * + * + * + * AUTHORS' REQUEST: + * + * If you use|reference|benchmark this code, please cite our Technical + * Report (http://www.cs.virginia.edu/~dgm4d/papers/RadixSortTR.pdf): + * + * @TechReport{ Merrill:Sorting:2010, + * author = "Duane Merrill and Andrew Grimshaw", + * title = "Revisiting Sorting for GPGPU Stream Architectures", + * year = "2010", + * institution = "University of Virginia, Department of Computer Science", + * address = "Charlottesville, VA, USA", + * number = "CS2010-03" + * } + * + * For more information, see our Google Code project site: + * http://code.google.com/p/back40computing/ + * + * Thanks! + * + ******************************************************************************/ + + +/****************************************************************************** + * Top-level histogram/spine scanning kernel + ******************************************************************************/ + +#pragma once + +#include "radixsort_kernel_common.h" + +namespace thrust { +namespace system { +namespace cuda { +namespace detail { +namespace detail { +namespace b40c_thrust { + +/****************************************************************************** + * Scans a cycle of RADIXSORT_CYCLE_ELEMENTS elements + ******************************************************************************/ + +template +__device__ __forceinline__ void SrtsScanCycle( + int *smem_offset, + int *smem_segment, + int warpscan[2][B40C_WARP_THREADS], + int4 *in, + int4 *out, + int &carry) +{ + int4 datum; + + // read input data + datum = in[threadIdx.x]; + + smem_offset[0] = datum.x + datum.y + datum.z + datum.w; + + __syncthreads(); + + if (threadIdx.x < B40C_WARP_THREADS) { + + int partial_reduction = SerialReduce(smem_segment); + + int seed = WarpScan(warpscan, partial_reduction, 0); + seed += carry; + + SerialScan(smem_segment, seed); + + carry += warpscan[1][B40C_WARP_THREADS - 1]; + } + + __syncthreads(); + + int part0 = smem_offset[0]; + int part1; + + part1 = datum.x + part0; + datum.x = part0; + part0 = part1 + datum.y; + datum.y = part1; + + part1 = datum.z + part0; + datum.z = part0; + part0 = part1 + datum.w; + datum.w = part1; + + out[threadIdx.x] = datum; +} + + +/****************************************************************************** + * Spine/histogram Scan Kernel Entry Point + ******************************************************************************/ + +template +__global__ void SrtsScanSpine( + int *d_ispine, + int *d_ospine, + int normal_block_elements) +{ + const int LOG_PARTIALS = B40C_RADIXSORT_LOG_THREADS; + const int PARTIALS = 1 << LOG_PARTIALS; + + const int LOG_PARTIALS_PER_SEG = LOG_PARTIALS - B40C_LOG_WARP_THREADS; + const int PARTIALS_PER_SEG = 1 << LOG_PARTIALS_PER_SEG; + + const int LOG_PARTIALS_PER_ROW = (LOG_PARTIALS_PER_SEG < B40C_LOG_MEM_BANKS(__CUDA_ARCH__)) ? 
B40C_LOG_MEM_BANKS(__CUDA_ARCH__) : LOG_PARTIALS_PER_SEG; // floor of 32 elts per row + const int PARTIALS_PER_ROW = 1 << LOG_PARTIALS_PER_ROW; + + const int LOG_SEGS_PER_ROW = LOG_PARTIALS_PER_ROW - LOG_PARTIALS_PER_SEG; + const int SEGS_PER_ROW = 1 << LOG_SEGS_PER_ROW; + + const int SMEM_ROWS = PARTIALS / PARTIALS_PER_ROW; + + __shared__ int smem[SMEM_ROWS][PARTIALS_PER_ROW + 1]; + __shared__ int warpscan[2][B40C_WARP_THREADS]; + + // WAR spurious unused constant warning + SuppressUnusedConstantWarning(PARTIALS_PER_SEG); + + int *smem_segment = 0; + int carry = 0; + + int row = threadIdx.x >> LOG_PARTIALS_PER_ROW; + int col = threadIdx.x & (PARTIALS_PER_ROW - 1); + int *smem_offset = &smem[row][col]; + + if (blockIdx.x > 0) { + return; + } + + if (threadIdx.x < B40C_WARP_THREADS) { + + // two segs per row, odd segs are offset by 8 + row = threadIdx.x >> LOG_SEGS_PER_ROW; + col = (threadIdx.x & (SEGS_PER_ROW - 1)) << LOG_PARTIALS_PER_SEG; + smem_segment = &smem[row][col]; + + if (threadIdx.x < B40C_WARP_THREADS) { + carry = 0; + warpscan[0][threadIdx.x] = 0; + } + } + + // scan the spine in blocks of cycle_elements + int block_offset = 0; + while (block_offset < normal_block_elements) { + + SrtsScanCycle( + smem_offset, + smem_segment, + warpscan, + reinterpret_cast((void *) &d_ispine[block_offset]), + reinterpret_cast((void *) &d_ospine[block_offset]), + carry); + + block_offset += B40C_RADIXSORT_SPINE_CYCLE_ELEMENTS; + } +} + + +} // end namespace b40c_thrust +} // end namespace detail +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cuda/detail/detail/b40c/vector_types.h b/compat/thrust/system/cuda/detail/detail/b40c/vector_types.h new file mode 100644 index 0000000..6db7931 --- /dev/null +++ b/compat/thrust/system/cuda/detail/detail/b40c/vector_types.h @@ -0,0 +1,96 @@ +/** + * Copyright 2010 Duane Merrill + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * For more information, see our Google Code project site: + * http://code.google.com/p/back40computing/ + * + * Thanks! 
+ */ + +#pragma once + +#include + +namespace thrust { +namespace system { +namespace cuda { +namespace detail { +namespace detail { +namespace b40c_thrust { + +//------------------------------------------------------------------------------ +// Vector types +//------------------------------------------------------------------------------ + +template struct VecType; + + +// +// Define general vector types +// + +template +struct VecType { + K x; + typedef K Type; +}; + +template +struct VecType { + K x; + K y; + typedef VecType Type; +}; + +template +struct VecType { + K x; + K y; + K z; + K w; + typedef VecType Type; +}; + +// +// Specialize certain built-in vector types +// + +#define B40C_DEFINE_VECTOR_TYPE(base_type,short_type) \ + template<> struct VecType { typedef short_type##1 Type; }; \ + template<> struct VecType { typedef short_type##2 Type; }; \ + template<> struct VecType { typedef short_type##4 Type; }; + +B40C_DEFINE_VECTOR_TYPE(char, char) +B40C_DEFINE_VECTOR_TYPE(short, short) +B40C_DEFINE_VECTOR_TYPE(int, int) +B40C_DEFINE_VECTOR_TYPE(long, long) +B40C_DEFINE_VECTOR_TYPE(long long, longlong) +B40C_DEFINE_VECTOR_TYPE(unsigned char, uchar) +B40C_DEFINE_VECTOR_TYPE(unsigned short, ushort) +B40C_DEFINE_VECTOR_TYPE(unsigned int, uint) +B40C_DEFINE_VECTOR_TYPE(unsigned long, ulong) +B40C_DEFINE_VECTOR_TYPE(unsigned long long, ulonglong) +B40C_DEFINE_VECTOR_TYPE(float, float) +B40C_DEFINE_VECTOR_TYPE(double, double) + +#undef B40C_DEFINE_VECTOR_TYPE + +} // end namespace b40c_thrust +} // end namespace detail +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cuda/detail/detail/balanced_path.h b/compat/thrust/system/cuda/detail/detail/balanced_path.h new file mode 100644 index 0000000..51e4f5b --- /dev/null +++ b/compat/thrust/system/cuda/detail/detail/balanced_path.h @@ -0,0 +1,156 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ +namespace detail +{ +namespace balanced_path_detail +{ + +template +__host__ __device__ void BinarySearchIteration(It data, int& begin, int& end, + T key, int shift, Comp comp) { + + IntT scale = (1<< shift) - 1; + int mid = (int)((begin + scale * end)>> shift); + + T key2 = data[mid]; + bool pred = UpperBound ? 
!comp(key, key2) : comp(key2, key); + if(pred) begin = (int)mid + 1; + else end = mid; +} + +template +__host__ __device__ int BinarySearch(It data, int count, T key, Comp comp) { + int begin = 0; + int end = count; + while(begin < end) + BinarySearchIteration(data, begin, end, key, 1, comp); + return begin; +} + +template +__host__ __device__ int BiasedBinarySearch(It data, int count, T key, + IntT levels, Comp comp) { + int begin = 0; + int end = count; + + if(levels >= 4 && begin < end) + BinarySearchIteration(data, begin, end, key, 9, comp); + if(levels >= 3 && begin < end) + BinarySearchIteration(data, begin, end, key, 7, comp); + if(levels >= 2 && begin < end) + BinarySearchIteration(data, begin, end, key, 5, comp); + if(levels >= 1 && begin < end) + BinarySearchIteration(data, begin, end, key, 4, comp); + + while(begin < end) + BinarySearchIteration(data, begin, end, key, 1, comp); + return begin; +} + +template +__host__ __device__ int MergePath(It1 a, int aCount, It2 b, int bCount, int diag, Comp comp) +{ + typedef typename thrust::iterator_traits::value_type T; + + int begin = thrust::max(0, diag - bCount); + int end = thrust::min(diag, aCount); + + while(begin < end) + { + int mid = (begin + end)>> 1; + T aKey = a[mid]; + T bKey = b[diag - 1 - mid]; + bool pred = UpperBound ? comp(aKey, bKey) : !comp(bKey, aKey); + if(pred) begin = mid + 1; + else end = mid; + } + return begin; +} + + +} // end namespace balanced_path_detail + + +template +__host__ __device__ +thrust::pair + balanced_path(RandomAccessIterator1 first1, Size1 n1, + RandomAccessIterator2 first2, Size1 n2, + Size1 diag, + Size2 levels, + Compare comp) +{ + typedef typename thrust::iterator_traits::value_type T; + + Size1 aIndex = balanced_path_detail::MergePath(first1, n1, first2, n2, diag, comp); + Size1 bIndex = diag - aIndex; + + bool star = false; + if(bIndex < n2) + { + T x = first2[bIndex]; + + // Search for the beginning of the duplicate run in both A and B. + Size1 aStart = balanced_path_detail::BiasedBinarySearch(first1, aIndex, x, levels, comp); + Size1 bStart = balanced_path_detail::BiasedBinarySearch(first2, bIndex, x, levels, comp); + + // The distance between x's merge path and its lower_bound is its rank. + // We add up the a and b ranks and evenly distribute them to + // get a stairstep path. + Size1 aRun = aIndex - aStart; + Size1 bRun = bIndex - bStart; + Size1 xCount = aRun + bRun; + + // Attempt to advance b and regress a. 
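+ // i.e. re-split the duplicate run so that A and B contribute to the partition as evenly as the data allows, preferring to take from B; the 'star' flag returned below records an uneven split so the calling set operation can compensate.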
+ Size1 bAdvance = thrust::max(xCount >> 1, xCount - aRun); + Size1 bEnd = thrust::min(n2, bStart + bAdvance + 1); + Size1 bRunEnd = balanced_path_detail::BinarySearch(first2 + bIndex, bEnd - bIndex, x, comp) + bIndex; + bRun = bRunEnd - bStart; + + bAdvance = thrust::min(bAdvance, bRun); + Size1 aAdvance = xCount - bAdvance; + + bool roundUp = (aAdvance == bAdvance + 1) && (bAdvance < bRun); + aIndex = aStart + aAdvance; + + if(roundUp) star = true; + } + + return thrust::make_pair(aIndex, (diag - aIndex) + star); +} + + +} // end namespace detail +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cuda/detail/detail/cached_temporary_allocator.h b/compat/thrust/system/cuda/detail/detail/cached_temporary_allocator.h new file mode 100644 index 0000000..2bbd658 --- /dev/null +++ b/compat/thrust/system/cuda/detail/detail/cached_temporary_allocator.h @@ -0,0 +1,156 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ +namespace detail +{ + + +template class BasePolicy> + class cached_temporary_allocator + : public BasePolicy > +{ + private: + typedef thrust::detail::temporary_allocator base_allocator_type; + typedef thrust::detail::allocator_traits traits; + typedef typename traits::pointer allocator_pointer; + typedef std::multimap free_blocks_type; + typedef std::map allocated_blocks_type; + + base_allocator_type m_base_allocator; + free_blocks_type free_blocks; + allocated_blocks_type allocated_blocks; + + void free_all() + { + // deallocate all outstanding blocks in both lists + for(free_blocks_type::iterator i = free_blocks.begin(); + i != free_blocks.end(); + ++i) + { + // transform the pointer to allocator_pointer before calling deallocate + traits::deallocate(m_base_allocator, allocator_pointer(reinterpret_cast(i->second)), i->first); + } + + for(allocated_blocks_type::iterator i = allocated_blocks.begin(); + i != allocated_blocks.end(); + ++i) + { + // transform the pointer to allocator_pointer before calling deallocate + traits::deallocate(m_base_allocator, allocator_pointer(reinterpret_cast(i->first)), i->second); + } + } + + public: + cached_temporary_allocator(thrust::execution_policy &system) + : m_base_allocator(system) + {} + + ~cached_temporary_allocator() + { + // free all allocations when cached_allocator goes out of scope + free_all(); + } + + void *allocate(std::ptrdiff_t num_bytes) + { + void *result = 0; + + // search the cache for a free block + free_blocks_type::iterator free_block = free_blocks.find(num_bytes); + + if(free_block != free_blocks.end()) + { + // get the pointer + result = free_block->second; + + // erase from the free_blocks map + free_blocks.erase(free_block); + } + else + { + // no allocation of the right size exists + // create a new one with 
m_base_allocator + // allocate memory and convert to raw pointer + result = thrust::raw_pointer_cast(traits::allocate(m_base_allocator, num_bytes)); + } + + // insert the allocated pointer into the allocated_blocks map + allocated_blocks.insert(std::make_pair(result, num_bytes)); + + return result; + } + + void deallocate(void *ptr) + { + // erase the allocated block from the allocated blocks map + allocated_blocks_type::iterator iter = allocated_blocks.find(ptr); + std::ptrdiff_t num_bytes = iter->second; + allocated_blocks.erase(iter); + + // insert the block into the free blocks map + free_blocks.insert(std::make_pair(num_bytes, ptr)); + } +}; + + +// overload get_temporary_buffer on cached_temporary_allocator +// note that we take a reference to cached_temporary_allocator +template class BasePolicy> + thrust::pair + get_temporary_buffer(cached_temporary_allocator &alloc, std::ptrdiff_t n) +{ + // ask the allocator for sizeof(T) * n bytes + T* result = reinterpret_cast(alloc.allocate(sizeof(T) * n)); + + // return the pointer and the number of elements allocated + return thrust::make_pair(result,n); +} + + +// overload return_temporary_buffer on cached_temporary_allocator +// an overloaded return_temporary_buffer should always accompany +// an overloaded get_temporary_buffer +template class BasePolicy> + void return_temporary_buffer(cached_temporary_allocator &alloc, Pointer p) +{ + // return the pointer to the allocator + alloc.deallocate(thrust::raw_pointer_cast(p)); +} + + +} // end detail +} // end detail +} // end cuda +} // end system +} // end thrust + diff --git a/compat/thrust/system/cuda/detail/detail/fast_scan.h b/compat/thrust/system/cuda/detail/detail/fast_scan.h new file mode 100644 index 0000000..d095a4a --- /dev/null +++ b/compat/thrust/system/cuda/detail/detail/fast_scan.h @@ -0,0 +1,67 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file fast_scan.h + * \brief A fast scan for primitive types. 
+ */ + +#pragma once + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ +namespace detail +{ +namespace fast_scan +{ + +template +OutputIterator inclusive_scan(execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator output, + BinaryFunction binary_op); + +template +OutputIterator exclusive_scan(execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator output, + const T init, + BinaryFunction binary_op); + +} // end namespace fast_scan +} // end namespace detail +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + +#include "fast_scan.inl" + diff --git a/compat/thrust/system/cuda/detail/detail/fast_scan.inl b/compat/thrust/system/cuda/detail/detail/fast_scan.inl new file mode 100644 index 0000000..b02763d --- /dev/null +++ b/compat/thrust/system/cuda/detail/detail/fast_scan.inl @@ -0,0 +1,753 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#include + +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN + + +namespace thrust +{ +namespace detail +{ + +// forward declaration of temporary_array +template class temporary_array; + +} // end detail + +namespace system +{ +namespace cuda +{ +namespace detail +{ +namespace detail +{ +namespace fast_scan +{ +namespace fast_scan_detail +{ + + +// TODO tune this +template +struct inclusive_scan_block_size +{ + private: + static const unsigned int max_memory = 16384 - 256 - 2 * sizeof(ValueType); + static const unsigned int max_block_size = max_memory / sizeof(ValueType); + static const unsigned int default_block_size = 7 * 32; + static const unsigned int block_size = (max_block_size < default_block_size) ? max_block_size : default_block_size; + + public: + static const unsigned int pass1 = block_size; + static const unsigned int pass2 = block_size; + static const unsigned int pass3 = block_size; +}; + +// TODO tune this +template +struct exclusive_scan_block_size +{ + private: + static const unsigned int max_memory = 16384 - 256 - 2 * sizeof(ValueType); + static const unsigned int max_block_size = max_memory / sizeof(ValueType); + static const unsigned int default_block_size = 5 * 32; + static const unsigned int block_size = (max_block_size < default_block_size) ? 
max_block_size : default_block_size; + + public: + static const unsigned int pass1 = block_size; + static const unsigned int pass2 = block_size; + static const unsigned int pass3 = block_size; +}; + + +template +__device__ __thrust_forceinline__ +void scan_block(Context context, SharedArray array, BinaryFunction binary_op) +{ + typedef typename thrust::iterator_value::type T; + + T val = array[context.thread_index()]; + + if (CTA_SIZE > 1) { if(context.thread_index() >= 1) { T tmp = array[context.thread_index() - 1]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); } + if (CTA_SIZE > 2) { if(context.thread_index() >= 2) { T tmp = array[context.thread_index() - 2]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); } + if (CTA_SIZE > 4) { if(context.thread_index() >= 4) { T tmp = array[context.thread_index() - 4]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); } + if (CTA_SIZE > 8) { if(context.thread_index() >= 8) { T tmp = array[context.thread_index() - 8]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); } + if (CTA_SIZE > 16) { if(context.thread_index() >= 16) { T tmp = array[context.thread_index() - 16]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); } + if (CTA_SIZE > 32) { if(context.thread_index() >= 32) { T tmp = array[context.thread_index() - 32]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); } + if (CTA_SIZE > 64) { if(context.thread_index() >= 64) { T tmp = array[context.thread_index() - 64]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); } + if (CTA_SIZE > 128) { if(context.thread_index() >= 128) { T tmp = array[context.thread_index() - 128]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); } + if (CTA_SIZE > 256) { if(context.thread_index() >= 256) { T tmp = array[context.thread_index() - 256]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); } + if (CTA_SIZE > 512) { if(context.thread_index() >= 512) { T tmp = array[context.thread_index() - 512]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); } + if (CTA_SIZE > 1024) { if(context.thread_index() >= 1024) { T tmp = array[context.thread_index() - 1024]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); } +} + +template +__device__ __thrust_forceinline__ +void scan_block_n(Context context, SharedArray array, const unsigned int n, BinaryFunction binary_op) +{ + typedef typename thrust::iterator_value::type T; + + T val = array[context.thread_index()]; + + if (CTA_SIZE > 1) { if(context.thread_index() < n && context.thread_index() >= 1) { T tmp = array[context.thread_index() - 1]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); } + if (CTA_SIZE > 2) { if(context.thread_index() < n && context.thread_index() >= 2) { T tmp = array[context.thread_index() - 2]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); } + if (CTA_SIZE > 4) { if(context.thread_index() < n && context.thread_index() >= 4) { T tmp = 
array[context.thread_index() - 4]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); } + if (CTA_SIZE > 8) { if(context.thread_index() < n && context.thread_index() >= 8) { T tmp = array[context.thread_index() - 8]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); } + if (CTA_SIZE > 16) { if(context.thread_index() < n && context.thread_index() >= 16) { T tmp = array[context.thread_index() - 16]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); } + if (CTA_SIZE > 32) { if(context.thread_index() < n && context.thread_index() >= 32) { T tmp = array[context.thread_index() - 32]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); } + if (CTA_SIZE > 64) { if(context.thread_index() < n && context.thread_index() >= 64) { T tmp = array[context.thread_index() - 64]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); } + if (CTA_SIZE > 128) { if(context.thread_index() < n && context.thread_index() >= 128) { T tmp = array[context.thread_index() - 128]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); } + if (CTA_SIZE > 256) { if(context.thread_index() < n && context.thread_index() >= 256) { T tmp = array[context.thread_index() - 256]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); } + if (CTA_SIZE > 512) { if(context.thread_index() < n && context.thread_index() >= 512) { T tmp = array[context.thread_index() - 512]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); } + if (CTA_SIZE > 1024) { if(context.thread_index() < n && context.thread_index() >= 1024) { T tmp = array[context.thread_index() - 1024]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); } +} + +template +__device__ __thrust_forceinline__ +void load_block(Context context, + const unsigned int n, + InputIterator input, + ValueType (&sdata)[K][CTA_SIZE + 1]) +{ + for(unsigned int k = 0; k < K; k++) + { + const unsigned int offset = k*CTA_SIZE + context.thread_index(); + + if (FullBlock || offset < n) + { + InputIterator temp = input + offset; + sdata[offset % K][offset / K] = *temp; + } + } + + context.barrier(); +} + +template +__device__ __thrust_forceinline__ +void store_block(Context context, + const unsigned int n, + OutputIterator output, + ValueType (&sdata)[K][CTA_SIZE + 1], + ValueType& carry) +{ + if (Inclusive) + { + for(unsigned int k = 0; k < K; k++) + { + const unsigned int offset = k*CTA_SIZE + context.thread_index(); + + if (FullBlock || offset < n) + { + OutputIterator temp = output + offset; + *temp = sdata[offset % K][offset / K]; + } + } + } + else + { + for(unsigned int k = 0; k < K; k++) + { + const unsigned int offset = k*CTA_SIZE + context.thread_index(); + + if (FullBlock || offset < n) + { + OutputIterator temp = output + offset; + *temp = (offset == 0) ? 
carry : sdata[(offset - 1) % K][(offset - 1) / K]; + } + } + } +} + +template +__device__ __thrust_forceinline__ +void upsweep_body(Context context, + const unsigned int n, + const bool carry_in, + InputIterator input, + BinaryFunction binary_op, + ValueType (&sdata)[K][CTA_SIZE + 1], + ValueType& carry) +{ + // read data + load_block(context, n, input, sdata); + + // copy into local array + ValueType ldata[K]; + for (unsigned int k = 0; k < K; k++) + ldata[k] = sdata[k][context.thread_index()]; + + // carry in + if (context.thread_index() == 0 && carry_in) + { + // XXX WAR sm_10 issue + ValueType tmp = carry; + ldata[0] = binary_op(tmp, ldata[0]); + } + + // scan local values + for(unsigned int k = 1; k < K; k++) + { + const unsigned int offset = K * context.thread_index() + k; + + if (FullBlock || offset < n) + ldata[k] = binary_op(ldata[k-1],ldata[k]); + } + + sdata[K - 1][context.thread_index()] = ldata[K - 1]; + + context.barrier(); + + // second level scan + if (FullBlock && sizeof(ValueType) > 1) // TODO investigate why this WAR is necessary + scan_block(context, sdata[K - 1], binary_op); + else + scan_block_n(context, sdata[K - 1], n / K, binary_op); + + // store carry out + if (FullBlock) + { + if (context.thread_index() == CTA_SIZE - 1) + carry = sdata[K - 1][context.thread_index()]; + } + else + { + if (context.thread_index() == (n - 1) / K) + { + ValueType sum; + + for (unsigned int k = 0; k < K; k++) + if ((n - 1) % K == k) + sum = ldata[k]; + + if (context.thread_index() > 0) + { + // WAR sm_10 issue + ValueType tmp = sdata[K - 1][context.thread_index() - 1]; + sum = binary_op(tmp, sum); + } + + carry = sum; + } + } + + context.barrier(); +} + +template +__device__ __thrust_forceinline__ +void scan_body(Context context, + const unsigned int n, + const bool carry_in, + InputIterator input, + OutputIterator output, + BinaryFunction binary_op, + ValueType (&sdata)[K][CTA_SIZE + 1], + ValueType& carry) +{ + // read data + load_block(context, n, input, sdata); + + // copy into local array + ValueType ldata[K]; + for (unsigned int k = 0; k < K; k++) + ldata[k] = sdata[k][context.thread_index()]; + + // carry in + if (context.thread_index() == 0 && carry_in) + { + // XXX WAR sm_10 issue + ValueType tmp = carry; + ldata[0] = binary_op(tmp, ldata[0]); + } + + // scan local values + for(unsigned int k = 1; k < K; k++) + { + const unsigned int offset = K * context.thread_index() + k; + + if (FullBlock || offset < n) + ldata[k] = binary_op(ldata[k-1],ldata[k]); + } + + sdata[K - 1][context.thread_index()] = ldata[K - 1]; + + context.barrier(); + + // second level scan + if (FullBlock) + scan_block(context, sdata[K - 1], binary_op); + else + scan_block_n(context, sdata[K - 1], n / K, binary_op); + + // update local values + if (context.thread_index() > 0) + { + ValueType left = sdata[K - 1][context.thread_index() - 1]; + + for(unsigned int k = 0; k < K; k++) + { + const unsigned int offset = K * context.thread_index() + k; + + if (FullBlock || offset < n) + ldata[k] = binary_op(left, ldata[k]); + } + } + + for (unsigned int k = 0; k < K; k++) + sdata[k][context.thread_index()] = ldata[k]; + + context.barrier(); + + // write data + store_block(context, n, output, sdata, carry); + + // store carry out + if (context.thread_index() == 0) + { + if (FullBlock) + carry = sdata[K - 1][CTA_SIZE - 1]; + else + carry = sdata[(n - 1) % K][(n - 1) / K]; // note: this must come after the local update + } + + context.barrier(); +} + +template +struct upsweep_intervals_closure +{ + InputIterator input; + 
ValueType * block_results; // TODO change this to ValueIterator + BinaryFunction binary_op; + Decomposition decomp; + Context context; + + typedef Context context_type; + + upsweep_intervals_closure(InputIterator input, + ValueType * block_results, + BinaryFunction binary_op, + Decomposition decomp, + Context context = Context()) + : input(input), block_results(block_results), binary_op(binary_op), decomp(decomp), context(context) {} + + __device__ __thrust_forceinline__ + void operator()(void) + { + typedef typename Decomposition::index_type IndexType; + + const unsigned int CTA_SIZE = context_type::ThreadsPerBlock::value; + +#if __CUDA_ARCH__ >= 200 + const unsigned int SMEM = (48 * 1024); +#else + const unsigned int SMEM = (16 * 1024) - 256; +#endif + const unsigned int MAX_K = ((SMEM - 1 * sizeof(ValueType)) / (sizeof(ValueType) * (CTA_SIZE + 1))); + const unsigned int K = (MAX_K < 6) ? MAX_K : 6; + + __shared__ uninitialized sdata; // padded to avoid bank conflicts + + __shared__ uninitialized carry; // storage for carry out + if(context.thread_index() == 0) carry.construct(); + + context.barrier(); + + thrust::system::detail::internal::index_range interval = decomp[context.block_index()]; + + IndexType base = interval.begin(); + + input += base; + + const unsigned int unit_size = K * CTA_SIZE; + + bool carry_in = false; + + // process full units + while (base + unit_size <= interval.end()) + { + const unsigned int n = unit_size; + upsweep_body(context, n, carry_in, input, binary_op, sdata.get(), carry.get()); + base += unit_size; + input += unit_size; + carry_in = true; + } + + // process partially full unit at end of input (if necessary) + if (base < interval.end()) + { + const unsigned int n = interval.end() - base; + upsweep_body(context, n, carry_in, input, binary_op, sdata.get(), carry.get()); + } + + // write interval sum + if (context.thread_index() == 0) + block_results[context.block_index()] = carry; + } +}; + + +template +struct downsweep_intervals_closure +{ + InputIterator input; + OutputIterator output; + ValueType * block_results; + BinaryFunction binary_op; + Decomposition decomp; + Context context; + + typedef Context context_type; + + downsweep_intervals_closure(InputIterator input, + OutputIterator output, + ValueType * block_results, + BinaryFunction binary_op, + Decomposition decomp, + Context context = Context()) + : input(input), output(output), block_results(block_results), binary_op(binary_op), decomp(decomp), context(context) {} + + __device__ __thrust_forceinline__ + void operator()(void) + { + typedef typename Decomposition::index_type IndexType; + + const unsigned int CTA_SIZE = context_type::ThreadsPerBlock::value; + +#if __CUDA_ARCH__ >= 200 + const unsigned int SMEM = (48 * 1024); +#else + const unsigned int SMEM = (16 * 1024) - 256; +#endif + const unsigned int MAX_K = ((SMEM - 1 * sizeof(ValueType))/ (sizeof(ValueType) * (CTA_SIZE + 1))); + const unsigned int K = (MAX_K < 6) ? MAX_K : 6; + + __shared__ uninitialized sdata; // padded to avoid bank conflicts + + __shared__ uninitialized carry; // storage for carry in and carry out + if(context.thread_index() == 0) carry.construct(); + + context.barrier(); + + thrust::system::detail::internal::index_range interval = decomp[context.block_index()]; + + IndexType base = interval.begin(); + + input += base; + output += base; + + const unsigned int unit_size = K * CTA_SIZE; + + bool carry_in = (Inclusive && context.block_index() == 0) ? 
false : true; + + if (carry_in) + { + if (context.thread_index() == 0) + carry = block_results[context.block_index()]; + context.barrier(); + } + + // process full units + while (base + unit_size <= interval.end()) + { + const unsigned int n = unit_size; + scan_body(context, n, carry_in, input, output, binary_op, sdata.get(), carry.get()); + base += K * CTA_SIZE; + input += K * CTA_SIZE; + output += K * CTA_SIZE; + carry_in = true; + } + + // process partially full unit at end of input (if necessary) + if (base < interval.end()) + { + const unsigned int n = interval.end() - base; + scan_body(context, n, carry_in, input, output, binary_op, sdata.get(), carry.get()); + } + } +}; + + +} // end namespace fast_scan_detail + + +template +OutputIterator inclusive_scan(execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator output, + BinaryFunction binary_op) +{ + using namespace fast_scan_detail; + + // the pseudocode for deducing the type of the temporary used below: + // + // if BinaryFunction is AdaptableBinaryFunction + // TemporaryType = AdaptableBinaryFunction::result_type + // else if OutputIterator is a "pure" output iterator + // TemporaryType = InputIterator::value_type + // else + // TemporaryType = OutputIterator::value_type + // + // XXX upon c++0x, TemporaryType needs to be: + // result_of::type + + typedef typename thrust::detail::eval_if< + thrust::detail::has_result_type::value, + thrust::detail::result_type, + thrust::detail::eval_if< + thrust::detail::is_output_iterator::value, + thrust::iterator_value, + thrust::iterator_value + > + >::type ValueType; + + typedef unsigned int IndexType; + typedef thrust::system::detail::internal::uniform_decomposition Decomposition; + typedef thrust::detail::temporary_array ValueArray; + + if (first == last) + return output; + + Decomposition decomp = thrust::system::cuda::detail::default_decomposition(last - first); + + ValueArray block_results(exec, decomp.size()); + + // compute sum over each interval + if (thrust::detail::is_commutative::value) + { + // use reduce_intervals for commutative operators + thrust::system::cuda::detail::reduce_intervals(exec, first, block_results.begin(), binary_op, decomp); + } + else + { + const static unsigned int ThreadsPerBlock = inclusive_scan_block_size::pass1; + typedef detail::statically_blocked_thread_array Context; + + typedef upsweep_intervals_closure Closure; + Closure closure(first, + thrust::raw_pointer_cast(&block_results[0]), + binary_op, + decomp); + detail::launch_closure(closure, decomp.size(), ThreadsPerBlock); + } + + // second level inclusive scan of per-block results + { + const static unsigned int ThreadsPerBlock = inclusive_scan_block_size::pass2; + typedef detail::statically_blocked_thread_array Context; + + typedef downsweep_intervals_closure Closure; + Closure closure(thrust::raw_pointer_cast(&block_results[0]), + thrust::raw_pointer_cast(&block_results[0]), + thrust::raw_pointer_cast(&block_results[0]), // not used + binary_op, + Decomposition(decomp.size(), 1, 1)); + detail::launch_closure(closure, 1, ThreadsPerBlock); + } + + // update intervals with result of second level scan + { + const static unsigned int ThreadsPerBlock = inclusive_scan_block_size::pass3; + typedef detail::statically_blocked_thread_array Context; + + typedef downsweep_intervals_closure Closure; + Closure closure(first, + output, + thrust::raw_pointer_cast(&block_results[0]) - 1, // shift block results + binary_op, + decomp); + detail::launch_closure(closure, decomp.size(), 
ThreadsPerBlock); + } + + return output + (last - first); +} + + +template +OutputIterator exclusive_scan(execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator output, + const T init, + BinaryFunction binary_op) +{ + using namespace fast_scan_detail; + + // the pseudocode for deducing the type of the temporary used below: + // + // if BinaryFunction is AdaptableBinaryFunction + // TemporaryType = AdaptableBinaryFunction::result_type + // else if OutputIterator is a "pure" output iterator + // TemporaryType = InputIterator::value_type + // else + // TemporaryType = OutputIterator::value_type + // + // XXX upon c++0x, TemporaryType needs to be: + // result_of::type + + typedef typename thrust::detail::eval_if< + thrust::detail::has_result_type::value, + thrust::detail::result_type, + thrust::detail::eval_if< + thrust::detail::is_output_iterator::value, + thrust::iterator_value, + thrust::iterator_value + > + >::type ValueType; + + typedef unsigned int IndexType; + typedef thrust::system::detail::internal::uniform_decomposition Decomposition; + typedef thrust::detail::temporary_array ValueArray; + + if (first == last) + return output; + + Decomposition decomp = thrust::system::cuda::detail::default_decomposition(last - first); + + ValueArray block_results(exec, decomp.size() + 1); + + // compute sum over each interval + if (thrust::detail::is_commutative::value) + { + // use reduce_intervals for commutative operators + thrust::system::cuda::detail::reduce_intervals(exec, first, block_results.begin() + 1, binary_op, decomp); + } + else + { + const static unsigned int ThreadsPerBlock = exclusive_scan_block_size::pass1; + typedef detail::statically_blocked_thread_array Context; + + typedef upsweep_intervals_closure Closure; + Closure closure(first, + thrust::raw_pointer_cast(&block_results[0]) + 1, + binary_op, + decomp); + detail::launch_closure(closure, decomp.size(), ThreadsPerBlock); + } + + // place init before per-block results + block_results[0] = init; + + // second level inclusive scan of per-block results + { + const static unsigned int ThreadsPerBlock = exclusive_scan_block_size::pass2; + typedef detail::statically_blocked_thread_array Context; + + typedef downsweep_intervals_closure Closure; + Closure closure(thrust::raw_pointer_cast(&block_results[0]), + thrust::raw_pointer_cast(&block_results[0]), + thrust::raw_pointer_cast(&block_results[0]), // not used + binary_op, + Decomposition(decomp.size() + 1, 1, 1)); + detail::launch_closure(closure, 1, ThreadsPerBlock); + } + + // update intervals with result of second level scan + { + const static unsigned int ThreadsPerBlock = exclusive_scan_block_size::pass3; + typedef detail::statically_blocked_thread_array Context; + + typedef downsweep_intervals_closure Closure; + Closure closure(first, + output, + thrust::raw_pointer_cast(&block_results[0]), // shift block results + binary_op, + decomp); + detail::launch_closure(closure, decomp.size(), ThreadsPerBlock); + } + + return output + (last - first); +} + + +} // end namespace fast_scan +} // end namespace detail +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + +__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END + diff --git a/compat/thrust/system/cuda/detail/detail/launch_calculator.h b/compat/thrust/system/cuda/detail/detail/launch_calculator.h new file mode 100644 index 0000000..5126aa6 --- /dev/null +++ b/compat/thrust/system/cuda/detail/detail/launch_calculator.h @@ -0,0 +1,82 @@ +/* + * 
Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ +namespace detail +{ + +template +class launch_calculator +{ + device_properties_t properties; + function_attributes_t attributes; + + public: + + launch_calculator(void); + + launch_calculator(const device_properties_t& properties, const function_attributes_t& attributes); + + thrust::tuple with_variable_block_size(void) const; + + template + thrust::tuple with_variable_block_size(UnaryFunction block_size_to_smem_size) const; + + thrust::tuple with_variable_block_size_available_smem(void) const; + + private: + + /*! Returns a pair (num_threads_per_block, num_blocks_per_multiprocessor) + * where num_threads_per_block is a valid block size for an instance of Closure + * chosen by a heuristic and num_blocks_per_multiprocessor is the maximum + * number of such blocks that can execute on a streaming multiprocessor at once. + */ + thrust::pair default_block_configuration() const; + + /*! Returns a pair (num_threads_per_block, num_blocks_per_multiprocessor) + * where num_threads_per_block is a valid block size for an instance of Closure + * chosen by a heuristic and num_blocks_per_multiprocessor is the maximum + * number of such blocks that can execute on a streaming multiprocessor at once. + * + * \param block_size_to_smem_size Mapping from num_threads_per_block to number of + * dynamically-allocated bytes of shared memory + */ + template + thrust::pair default_block_configuration(UnaryFunction block_size_to_smem_size) const; +}; + +} // end namespace detail +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/cuda/detail/detail/launch_calculator.inl b/compat/thrust/system/cuda/detail/detail/launch_calculator.inl new file mode 100644 index 0000000..b851d5f --- /dev/null +++ b/compat/thrust/system/cuda/detail/detail/launch_calculator.inl @@ -0,0 +1,103 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// do not attempt to compile this file with any other compiler +#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ +namespace detail +{ + +template +launch_calculator::launch_calculator(void) + : properties(device_properties()), + attributes(closure_attributes()) +{} + +template +launch_calculator::launch_calculator(const device_properties_t& properties, const function_attributes_t& attributes) + : properties(properties), + attributes(attributes) +{} + +template + template +thrust::pair launch_calculator::default_block_configuration(UnaryFunction block_size_to_smem_size) const +{ + // choose a block size + std::size_t num_threads_per_block = block_size_with_maximum_potential_occupancy(attributes, properties, block_size_to_smem_size); + + // choose a subscription rate + std::size_t num_blocks_per_multiprocessor = properties.maxThreadsPerMultiProcessor / num_threads_per_block; + + return thrust::make_pair(num_threads_per_block, num_blocks_per_multiprocessor); +} + + +template +thrust::pair launch_calculator::default_block_configuration(void) const +{ + // choose a block size + std::size_t num_threads_per_block = block_size_with_maximum_potential_occupancy(attributes, properties); + + // choose a subscription rate + std::size_t num_blocks_per_multiprocessor = properties.maxThreadsPerMultiProcessor / num_threads_per_block; + + return thrust::make_pair(num_threads_per_block, num_blocks_per_multiprocessor); +} + +template +thrust::tuple launch_calculator::with_variable_block_size(void) const +{ + thrust::pair config = default_block_configuration(); + return thrust::tuple(config.second * properties.multiProcessorCount, config.first, 0); +} + +template + template +thrust::tuple launch_calculator::with_variable_block_size(UnaryFunction block_size_to_smem_size) const +{ + thrust::pair config = default_block_configuration(block_size_to_smem_size); + return thrust::tuple(config.second * properties.multiProcessorCount, config.first, block_size_to_smem_size(config.first)); +} + +template +thrust::tuple launch_calculator::with_variable_block_size_available_smem(void) const +{ + thrust::pair config = default_block_configuration(); + size_t smem_per_block = proportional_smem_allocation(properties, attributes, config.second); + return thrust::tuple(config.second * properties.multiProcessorCount, config.first, smem_per_block); +} + +} // end detail +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + +#endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC + diff --git a/compat/thrust/system/cuda/detail/detail/launch_closure.h b/compat/thrust/system/cuda/detail/detail/launch_closure.h new file mode 100644 index 0000000..c2e6c43 --- /dev/null +++ b/compat/thrust/system/cuda/detail/detail/launch_closure.h @@ -0,0 +1,114 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ +namespace detail +{ + +template +struct launch_bounds +{ + typedef thrust::detail::integral_constant ThreadsPerBlock; + typedef thrust::detail::integral_constant BlocksPerMultiprocessor; +}; + +struct thread_array : public launch_bounds<> +{ +// CUDA built-in variables require nvcc +#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC + __device__ __thrust_forceinline__ unsigned int thread_index(void) const { return threadIdx.x; } + __device__ __thrust_forceinline__ unsigned int thread_count(void) const { return blockDim.x * gridDim.x; } +#else + __device__ __thrust_forceinline__ unsigned int thread_index(void) const { return 0; } + __device__ __thrust_forceinline__ unsigned int thread_count(void) const { return 0; } +#endif // THRUST_DEVICE_COMPILER_NVCC +}; + +struct blocked_thread_array : public launch_bounds<> +{ +// CUDA built-in variables require nvcc +#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC + __device__ __thrust_forceinline__ unsigned int thread_index(void) const { return threadIdx.x; } + __device__ __thrust_forceinline__ unsigned int block_dimension(void) const { return blockDim.x; } + __device__ __thrust_forceinline__ unsigned int block_index(void) const { return blockIdx.x; } + __device__ __thrust_forceinline__ unsigned int grid_dimension(void) const { return gridDim.x; } + __device__ __thrust_forceinline__ unsigned int linear_index(void) const { return block_dimension() * block_index() + thread_index(); } + __device__ __thrust_forceinline__ void barrier(void) { __syncthreads(); } +#else + __device__ __thrust_forceinline__ unsigned int thread_index(void) const { return 0; } + __device__ __thrust_forceinline__ unsigned int block_dimension(void) const { return 0; } + __device__ __thrust_forceinline__ unsigned int block_index(void) const { return 0; } + __device__ __thrust_forceinline__ unsigned int grid_dimension(void) const { return 0; } + __device__ __thrust_forceinline__ unsigned int linear_index(void) const { return 0; } + __device__ __thrust_forceinline__ void barrier(void) { } +#endif // THRUST_DEVICE_COMPILER_NVCC +}; + +template +struct statically_blocked_thread_array : public launch_bounds<_ThreadsPerBlock,1> +{ +// CUDA built-in variables require nvcc +#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC + __device__ __thrust_forceinline__ unsigned int thread_index(void) const { return threadIdx.x; } + __device__ __thrust_forceinline__ unsigned int block_dimension(void) const { return _ThreadsPerBlock; } // minor optimization + __device__ __thrust_forceinline__ unsigned int block_index(void) const { return blockIdx.x; } + __device__ __thrust_forceinline__ unsigned int grid_dimension(void) const { return gridDim.x; } + __device__ __thrust_forceinline__ unsigned int linear_index(void) const { return block_dimension() * block_index() + thread_index(); } + __device__ __thrust_forceinline__ void barrier(void) { __syncthreads(); } +#else + __device__ __thrust_forceinline__ unsigned int thread_index(void) const { return 0; } + __device__ __thrust_forceinline__ unsigned int block_dimension(void) const { return 0; } + __device__ __thrust_forceinline__ unsigned int block_index(void) const { return 0; } + __device__ __thrust_forceinline__ unsigned int grid_dimension(void) const { return 0; } + __device__ 
__thrust_forceinline__ unsigned int linear_index(void) const { return 0; } + __device__ __thrust_forceinline__ void barrier(void) { } +#endif // THRUST_DEVICE_COMPILER_NVCC +}; + +template + void launch_closure(Closure f, Size1 num_blocks, Size2 block_size); + +template + void launch_closure(Closure f, Size1 num_blocks, Size2 block_size, Size3 smem_size); + +/*! Returns a copy of the cudaFuncAttributes structure + * that is associated with a given Closure + */ +template +function_attributes_t closure_attributes(void); + +} // end namespace detail +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/cuda/detail/detail/launch_closure.inl b/compat/thrust/system/cuda/detail/detail/launch_closure.inl new file mode 100644 index 0000000..ce39cfc --- /dev/null +++ b/compat/thrust/system/cuda/detail/detail/launch_closure.inl @@ -0,0 +1,207 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace detail +{ + +// XXX WAR circular inclusion problems with this forward declaration +template class temporary_array; + +} // end detail + +namespace system +{ +namespace cuda +{ +namespace detail +{ +namespace detail +{ + +#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC +template +__global__ __launch_bounds__(Closure::context_type::ThreadsPerBlock::value, Closure::context_type::BlocksPerMultiprocessor::value) +void launch_closure_by_value(Closure f) +{ + f(); +} + +template +__global__ __launch_bounds__(Closure::context_type::ThreadsPerBlock::value, Closure::context_type::BlocksPerMultiprocessor::value) +void launch_closure_by_pointer(const Closure *f) +{ + // copy to registers + Closure f_reg = *f; + f_reg(); +} +#else +template +void launch_closure_by_value(Closure) {} + +template +void launch_closure_by_pointer(const Closure *) {} + +#endif // THRUST_DEVICE_COMPILER_NVCC + +template + struct closure_launcher_base +{ + typedef void (*launch_function_t)(Closure); + + static launch_function_t get_launch_function(void) + { + return launch_closure_by_value; + } + + template + static void launch(Closure f, Size1 num_blocks, Size2 block_size, Size3 smem_size) + { +#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC + if(num_blocks > 0) + { + launch_closure_by_value<<<(unsigned int) num_blocks, (unsigned int) block_size, (unsigned int) smem_size>>>(f); + synchronize_if_enabled("launch_closure_by_value"); + } +#endif // THRUST_DEVICE_COMPILER_NVCC + } +}; // end closure_launcher_base + + +template + struct closure_launcher_base +{ + typedef void (*launch_function_t)(const Closure *); + + static launch_function_t get_launch_function(void) + { + return launch_closure_by_pointer; + } + + template + static void launch(Closure f, Size1 num_blocks, Size2 block_size, Size3 smem_size) + { +#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC + 
if(num_blocks > 0) + { + // use temporary storage for the closure + // XXX use of cuda::tag is too specific here + thrust::cuda::tag cuda_tag; + thrust::host_system_tag host_tag; + thrust::detail::temporary_array closure_storage(cuda_tag, host_tag, &f, &f + 1); + + // launch + detail::launch_closure_by_pointer<<<(unsigned int) num_blocks, (unsigned int) block_size, (unsigned int) smem_size>>>((&closure_storage[0]).get()); + synchronize_if_enabled("launch_closure_by_pointer"); + } +#endif // THRUST_DEVICE_COMPILER_NVCC + } +}; + + +template + struct closure_launcher + : public closure_launcher_base +{ + typedef closure_launcher_base super_t; + + static inline const device_properties_t& device_properties(void) + { + return device_properties(); + } + + static inline function_attributes_t function_attributes(void) + { + return thrust::system::cuda::detail::function_attributes(super_t::get_launch_function()); + } + + template + static void launch(Closure f, Size1 num_blocks, Size2 block_size, Size3 smem_size) + { + super_t::launch(f,num_blocks,block_size,smem_size); + } +}; + +template + void launch_closure(Closure f, Size num_blocks) +{ + launch_calculator calculator; + launch_closure(f, num_blocks, thrust::get<1>(calculator.with_variable_block_size())); +} // end launch_closure() + +template + void launch_closure(Closure f, Size1 num_blocks, Size2 block_size) +{ + launch_closure(f, num_blocks, block_size, 0u); +} // end launch_closure() + +template + void launch_closure(Closure f, Size1 num_blocks, Size2 block_size, Size3 smem_size) +{ + closure_launcher::launch(f, num_blocks, block_size, smem_size); +} // end launch_closure() + + +template +function_attributes_t closure_attributes(void) +{ + typedef closure_launcher Launcher; + + // cache the result of function_attributes(), because it is slow + // only cache the first few devices + static const int max_num_devices = 16; + + static bool attributes_exist[max_num_devices] = {0}; + static function_attributes_t function_attributes[max_num_devices] = {}; + + // XXX device_id ought to be an argument to this function + int device_id = current_device(); + + if(device_id >= max_num_devices) + { + return thrust::system::cuda::detail::function_attributes(Launcher::get_launch_function()); + } + + if(!attributes_exist[device_id]) + { + function_attributes[device_id] = thrust::system::cuda::detail::function_attributes(Launcher::get_launch_function()); + + // disallow the compiler to move the write to attributes_exist[device_id] + // before the initialization of function_attributes[device_id] + __thrust_compiler_fence(); + + attributes_exist[device_id] = true; + } + + return function_attributes[device_id]; +} + +} // end namespace detail +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cuda/detail/detail/set_operation.h b/compat/thrust/system/cuda/detail/detail/set_operation.h new file mode 100644 index 0000000..5475731 --- /dev/null +++ b/compat/thrust/system/cuda/detail/detail/set_operation.h @@ -0,0 +1,56 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ +namespace detail +{ + +template + RandomAccessIterator3 set_operation(execution_policy &exec, + RandomAccessIterator1 first1, + RandomAccessIterator1 last1, + RandomAccessIterator2 first2, + RandomAccessIterator2 last2, + RandomAccessIterator3 result, + Compare comp, + SetOperation set_op); + + +} // end detail +} // end detail +} // end cuda +} // end system +} // end thrust + +#include + diff --git a/compat/thrust/system/cuda/detail/detail/set_operation.inl b/compat/thrust/system/cuda/detail/detail/set_operation.inl new file mode 100644 index 0000000..3f14379 --- /dev/null +++ b/compat/thrust/system/cuda/detail/detail/set_operation.inl @@ -0,0 +1,639 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ +namespace detail +{ +namespace set_operation_detail +{ + + +using thrust::system::cuda::detail::detail::statically_blocked_thread_array; +using thrust::detail::uint16_t; +using thrust::detail::uint32_t; + + +// empirically determined on sm_20 +// value_types larger than this will fail to launch if placed in smem +template + struct stage_through_smem +{ + static const bool value = sizeof(T) <= 6 * sizeof(uint32_t); +}; + + +// max_input_size <= 32 +template +inline __device__ + OutputIterator serial_bounded_copy_if(Size max_input_size, + InputIterator first, + uint32_t mask, + OutputIterator result) +{ + for(Size i = 0; i < max_input_size; ++i, ++first) + { + if((1< + struct find_partition_offsets_functor +{ + Size partition_size; + InputIterator1 first1; + InputIterator2 first2; + Size n1, n2; + Compare comp; + + find_partition_offsets_functor(Size partition_size, + InputIterator1 first1, InputIterator1 last1, + InputIterator2 first2, InputIterator2 last2, + Compare comp) + : partition_size(partition_size), + first1(first1), first2(first2), + n1(last1 - first1), n2(last2 - first2), + comp(comp) + {} + + inline __host__ __device__ + thrust::pair operator()(Size i) const + { + Size diag = thrust::min(n1 + n2, i * partition_size); + + // XXX the correctness of balanced_path depends critically on the ll suffix below + // why??? 
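    // (For orientation, and not part of balanced_path's own documentation:
    //  balanced_path cuts the two sorted inputs along the cross diagonal `diag`,
    //  returning a pair of offsets into the first and second ranges such that
    //  everything before the cut can be processed independently of everything
    //  after it. Runs of equal keys are divided between the inputs in a balanced
    //  way, and a cut may come back "starred", i.e. one element larger, which is
    //  why later code reserves work_per_thread + 1 slots per thread.)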
+ return balanced_path(first1, n1, first2, n2, diag, 4ll, comp); + } +}; + + +template + OutputIterator find_partition_offsets(thrust::cuda::execution_policy &exec, + Size num_partitions, + Size partition_size, + InputIterator1 first1, InputIterator1 last1, + InputIterator2 first2, InputIterator2 last2, + OutputIterator result, + Compare comp) +{ + find_partition_offsets_functor f(partition_size, first1, last1, first2, last2, comp); + + return thrust::transform(exec, + thrust::counting_iterator(0), + thrust::counting_iterator(num_partitions), + result, + f); +} + + +namespace block +{ + + +template +inline __device__ +T right_neighbor(statically_blocked_thread_array &ctx, const T &x, const T &boundary) +{ + // stage this shift to conserve smem + const unsigned int storage_size = block_size / 2; + __shared__ uninitialized_array shared; + + T result = x; + + unsigned int tid = ctx.thread_index(); + + if(0 < tid && tid <= storage_size) + { + shared[tid - 1] = x; + } + + ctx.barrier(); + + if(tid < storage_size) + { + result = shared[tid]; + } + + ctx.barrier(); + + tid -= storage_size; + if(0 < tid && tid <= storage_size) + { + shared[tid - 1] = x; + } + else if(tid == 0) + { + shared[storage_size-1] = boundary; + } + + ctx.barrier(); + + if(tid < storage_size) + { + result = shared[tid]; + } + + ctx.barrier(); + + return result; +} + + +template +inline __device__ + unsigned int bounded_count_set_operation_n(statically_blocked_thread_array &ctx, + InputIterator1 first1, uint16_t n1, + InputIterator2 first2, uint16_t n2, + Compare comp, + SetOperation set_op) +{ + unsigned int thread_idx = ctx.thread_index(); + + // find partition offsets + uint16_t diag = thrust::min(n1 + n2, thread_idx * work_per_thread); + thrust::pair thread_input_begin = balanced_path(first1, n1, first2, n2, diag, 2, comp); + thrust::pair thread_input_end = block::right_neighbor(ctx, thread_input_begin, thrust::make_pair(n1,n2)); + + __shared__ uint16_t s_thread_output_size[block_size]; + + // work_per_thread + 1 to accomodate a "starred" partition returned from balanced_path above + s_thread_output_size[thread_idx] = + set_op.count(work_per_thread + 1, + first1 + thread_input_begin.first, first1 + thread_input_end.first, + first2 + thread_input_begin.second, first2 + thread_input_end.second, + comp); + + ctx.barrier(); + + // reduce per-thread counts + thrust::system::cuda::detail::block::inplace_inclusive_scan(ctx, s_thread_output_size); + return s_thread_output_size[ctx.block_dimension() - 1]; +} + + +inline __device__ int pop_count(unsigned int x) +{ +// guard use of __popc from other compilers +#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC + return __popc(x); +#else + return x; +#endif +} + + + +template +inline __device__ + OutputIterator bounded_set_operation_n(statically_blocked_thread_array &ctx, + InputIterator1 first1, uint16_t n1, + InputIterator2 first2, uint16_t n2, + OutputIterator result, + Compare comp, + SetOperation set_op) +{ + unsigned int thread_idx = ctx.thread_index(); + + // find partition offsets + uint16_t diag = thrust::min(n1 + n2, thread_idx * work_per_thread); + thrust::pair thread_input_begin = balanced_path(first1, n1, first2, n2, diag, 2, comp); + thrust::pair thread_input_end = block::right_neighbor(ctx, thread_input_begin, thrust::make_pair(n1,n2)); + + typedef typename thrust::iterator_value::type value_type; + // +1 to accomodate a "starred" partition returned from balanced_path above + uninitialized_array sparse_result; + uint32_t active_mask = + set_op(work_per_thread + 
1, + first1 + thread_input_begin.first, first1 + thread_input_end.first, + first2 + thread_input_begin.second, first2 + thread_input_end.second, + sparse_result.begin(), + comp); + + __shared__ uint16_t s_thread_output_size[block_size]; + s_thread_output_size[thread_idx] = pop_count(active_mask); + + ctx.barrier(); + + // scan to turn per-thread counts into output indices + uint16_t block_output_size = thrust::system::cuda::detail::block::inplace_exclusive_scan(ctx, s_thread_output_size, 0u); + + serial_bounded_copy_if(work_per_thread + 1, sparse_result.begin(), active_mask, result + s_thread_output_size[thread_idx]); + + ctx.barrier(); + + return result + block_output_size; +} + + +template +inline __device__ + typename thrust::iterator_difference::type + count_set_operation(statically_blocked_thread_array &ctx, + InputIterator1 first1, InputIterator1 last1, + InputIterator2 first2, InputIterator2 last2, + Compare comp, + SetOperation set_op) +{ + typedef typename thrust::iterator_difference::type difference; + + difference result = 0; + + thrust::pair remaining_input_size = thrust::make_pair(last1 - first1, last2 - first2); + + // iterate until the input is consumed + while(remaining_input_size.first + remaining_input_size.second > 0) + { + // find the end of this subpartition's input + // -1 to accomodate "starred" partitions + uint16_t max_subpartition_size = block_size * work_per_thread - 1; + difference diag = thrust::min(remaining_input_size.first + remaining_input_size.second, max_subpartition_size); + thrust::pair subpartition_size = balanced_path(first1, remaining_input_size.first, first2, remaining_input_size.second, diag, 4ll, comp); + + typedef typename thrust::iterator_value::type value_type; + if(stage_through_smem::value) + { + // load the input into __shared__ storage + __shared__ uninitialized_array s_input; + + value_type *s_input_end1 = thrust::system::cuda::detail::block::copy_n(ctx, first1, subpartition_size.first, s_input.begin()); + value_type *s_input_end2 = thrust::system::cuda::detail::block::copy_n(ctx, first2, subpartition_size.second, s_input_end1); + + result += block::bounded_count_set_operation_n(ctx, + s_input.begin(), subpartition_size.first, + s_input_end1, subpartition_size.second, + comp, + set_op); + } + else + { + result += block::bounded_count_set_operation_n(ctx, + first1, subpartition_size.first, + first2, subpartition_size.second, + comp, + set_op); + } + + // advance input + first1 += subpartition_size.first; + first2 += subpartition_size.second; + + // decrement remaining size + remaining_input_size.first -= subpartition_size.first; + remaining_input_size.second -= subpartition_size.second; + } + + return result; +} + + +template +inline __device__ +OutputIterator set_operation(statically_blocked_thread_array &ctx, + InputIterator1 first1, InputIterator1 last1, + InputIterator2 first2, InputIterator2 last2, + OutputIterator result, + Compare comp, + SetOperation set_op) +{ + typedef typename thrust::iterator_difference::type difference; + + thrust::pair remaining_input_size = thrust::make_pair(last1 - first1, last2 - first2); + + // iterate until the input is consumed + while(remaining_input_size.first + remaining_input_size.second > 0) + { + // find the end of this subpartition's input + // -1 to accomodate "starred" partitions + uint16_t max_subpartition_size = block_size * work_per_thread - 1; + difference diag = thrust::min(remaining_input_size.first + remaining_input_size.second, max_subpartition_size); + thrust::pair subpartition_size = 
balanced_path(first1, remaining_input_size.first, first2, remaining_input_size.second, diag, 4ll, comp); + + typedef typename thrust::iterator_value::type value_type; + if(stage_through_smem::value) + { + // load the input into __shared__ storage + __shared__ uninitialized_array s_input; + + value_type *s_input_end1 = thrust::system::cuda::detail::block::copy_n(ctx, first1, subpartition_size.first, s_input.begin()); + value_type *s_input_end2 = thrust::system::cuda::detail::block::copy_n(ctx, first2, subpartition_size.second, s_input_end1); + + result = block::bounded_set_operation_n(ctx, + s_input.begin(), subpartition_size.first, + s_input_end1, subpartition_size.second, + result, + comp, + set_op); + } + else + { + result = block::bounded_set_operation_n(ctx, + first1, subpartition_size.first, + first2, subpartition_size.second, + result, + comp, + set_op); + } + + // advance input + first1 += subpartition_size.first; + first2 += subpartition_size.second; + + // decrement remaining size + remaining_input_size.first -= subpartition_size.first; + remaining_input_size.second -= subpartition_size.second; + } + + return result; +} + + +} // end namespace block + + +template + inline __device__ void count_set_operation(statically_blocked_thread_array &ctx, + InputIterator1 input_partition_offsets, + Size num_partitions, + InputIterator2 first1, + InputIterator3 first2, + OutputIterator result, + Compare comp, + SetOperation set_op) +{ + // consume partitions + for(Size partition_idx = ctx.block_index(); + partition_idx < num_partitions; + partition_idx += ctx.grid_dimension()) + { + typedef typename thrust::iterator_difference::type difference; + + // find the partition + thrust::pair block_input_begin = input_partition_offsets[partition_idx]; + thrust::pair block_input_end = input_partition_offsets[partition_idx + 1]; + + // count the size of the set operation + difference count = block::count_set_operation(ctx, + first1 + block_input_begin.first, first1 + block_input_end.first, + first2 + block_input_begin.second, first2 + block_input_end.second, + comp, + set_op); + + if(ctx.thread_index() == 0) + { + result[partition_idx] = count; + } + } +} + + +template + struct count_set_operation_closure +{ + typedef statically_blocked_thread_array context_type; + + InputIterator1 input_partition_offsets; + Size num_partitions; + InputIterator2 first1; + InputIterator3 first2; + OutputIterator result; + Compare comp; + SetOperation set_op; + + count_set_operation_closure(InputIterator1 input_partition_offsets, + Size num_partitions, + InputIterator2 first1, + InputIterator3 first2, + OutputIterator result, + Compare comp, + SetOperation set_op) + : input_partition_offsets(input_partition_offsets), + num_partitions(num_partitions), + first1(first1), + first2(first2), + result(result), + comp(comp), + set_op(set_op) + {} + + inline __device__ void operator()() const + { + context_type ctx; + count_set_operation(ctx, input_partition_offsets, num_partitions, first1, first2, result, comp, set_op); + } +}; + + +template + count_set_operation_closure + make_count_set_operation_closure(InputIterator1 input_partition_offsets, + Size num_partitions, + InputIterator2 first1, + InputIterator3 first2, + OutputIterator result, + Compare comp, + SetOperation set_op) +{ + typedef count_set_operation_closure result_type; + return result_type(input_partition_offsets,num_partitions,first1,first2,result,comp,set_op); +} + + +template +inline __device__ + void set_operation(statically_blocked_thread_array &ctx, + 
InputIterator1 input_partition_offsets, + Size num_partitions, + InputIterator2 first1, + InputIterator3 first2, + InputIterator4 output_partition_offsets, + OutputIterator result, + Compare comp, + SetOperation set_op) +{ + // consume partitions + for(Size partition_idx = ctx.block_index(); + partition_idx < num_partitions; + partition_idx += ctx.grid_dimension()) + { + typedef typename thrust::iterator_difference::type difference; + + // find the partition + thrust::pair block_input_begin = input_partition_offsets[partition_idx]; + thrust::pair block_input_end = input_partition_offsets[partition_idx + 1]; + + // do the set operation across the partition + block::set_operation(ctx, + first1 + block_input_begin.first, first1 + block_input_end.first, + first2 + block_input_begin.second, first2 + block_input_end.second, + result + output_partition_offsets[partition_idx], + comp, + set_op); + } +} + + +template + struct set_operation_closure +{ + typedef statically_blocked_thread_array context_type; + + InputIterator1 input_partition_offsets; + Size num_partitions; + InputIterator2 first1; + InputIterator3 first2; + InputIterator4 output_partition_offsets; + OutputIterator result; + Compare comp; + SetOperation set_op; + + set_operation_closure(InputIterator1 input_partition_offsets, + Size num_partitions, + InputIterator2 first1, + InputIterator3 first2, + InputIterator4 output_partition_offsets, + OutputIterator result, + Compare comp, + SetOperation set_op) + : input_partition_offsets(input_partition_offsets), + num_partitions(num_partitions), + first1(first1), + first2(first2), + output_partition_offsets(output_partition_offsets), + result(result), + comp(comp), + set_op(set_op) + {} + + inline __device__ void operator()() const + { + context_type ctx; + set_operation(ctx, input_partition_offsets, num_partitions, first1, first2, output_partition_offsets, result, comp, set_op); + } +}; + + +template + set_operation_closure + make_set_operation_closure(InputIterator1 input_partition_offsets, + Size num_partitions, + InputIterator2 first1, + InputIterator3 first2, + InputIterator4 output_partition_offsets, + OutputIterator result, + Compare comp, + SetOperation set_op) +{ + typedef set_operation_closure result_type; + return result_type(input_partition_offsets,num_partitions,first1,first2,output_partition_offsets,result,comp,set_op); +} + + +} // end namespace set_operation_detail + + +template + OutputIterator set_operation(thrust::cuda::execution_policy &exec, + InputIterator1 first1, InputIterator1 last1, + InputIterator2 first2, InputIterator2 last2, + OutputIterator result, + Compare comp, + SetOperation set_op) +{ + using thrust::system::cuda::detail::device_properties; + using thrust::system::cuda::detail::detail::launch_closure; + namespace d = thrust::system::cuda::detail::detail::set_operation_detail; + + typedef typename thrust::iterator_difference::type difference; + + const difference n1 = last1 - first1; + const difference n2 = last2 - first2; + + // handle empty input + if(n1 == 0 && n2 == 0) + { + return result; + } + + const thrust::detail::uint16_t work_per_thread = 15; + const thrust::detail::uint16_t threads_per_block = 128; + const thrust::detail::uint16_t work_per_block = threads_per_block * work_per_thread; + + // -1 because balanced_path adds a single element to the end of a "starred" partition, increasing its size by one + const thrust::detail::uint16_t maximum_partition_size = work_per_block - 1; + const difference num_partitions = thrust::detail::util::divide_ri(n1 
+ n2, maximum_partition_size); + + // find input partition offsets + // +1 to handle the end of the input elegantly + thrust::detail::temporary_array, DerivedPolicy> input_partition_offsets(0, exec, num_partitions + 1); + d::find_partition_offsets(exec, input_partition_offsets.size(), maximum_partition_size, first1, last1, first2, last2, input_partition_offsets.begin(), comp); + + const difference num_blocks = thrust::min(device_properties().maxGridSize[0], num_partitions); + + // find output partition offsets + // +1 to store the total size of the total + thrust::detail::temporary_array output_partition_offsets(0, exec, num_partitions + 1); + launch_closure(d::make_count_set_operation_closure(input_partition_offsets.begin(), num_partitions, first1, first2, output_partition_offsets.begin(), comp, set_op), + num_blocks, + threads_per_block); + + // turn the output partition counts into offsets to output partitions + thrust::exclusive_scan(exec, output_partition_offsets.begin(), output_partition_offsets.end(), output_partition_offsets.begin()); + + // run the set op kernel + launch_closure(d::make_set_operation_closure(input_partition_offsets.begin(), num_partitions, first1, first2, output_partition_offsets.begin(), result, comp, set_op), + num_blocks, + threads_per_block); + + return result + output_partition_offsets[num_partitions]; +} + + +} // end namespace detail +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cuda/detail/detail/stable_merge_sort.h b/compat/thrust/system/cuda/detail/detail/stable_merge_sort.h new file mode 100644 index 0000000..23f3254 --- /dev/null +++ b/compat/thrust/system/cuda/detail/detail/stable_merge_sort.h @@ -0,0 +1,63 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! 
\file stable_merge_sort_dev.h + * \brief Defines the interface for a stable merge implementation on CUDA + */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ +namespace detail +{ + +template +void stable_merge_sort(execution_policy &exec, + RandomAccessIterator first, + RandomAccessIterator last, + StrictWeakOrdering comp); + +template +void stable_merge_sort_by_key(execution_policy &exec, + RandomAccessIterator1 keys_begin, + RandomAccessIterator1 keys_end, + RandomAccessIterator2 values_begin, + StrictWeakOrdering comp); + +} // end namespace detail +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/cuda/detail/detail/stable_merge_sort.inl b/compat/thrust/system/cuda/detail/detail/stable_merge_sort.inl new file mode 100644 index 0000000..0c69803 --- /dev/null +++ b/compat/thrust/system/cuda/detail/detail/stable_merge_sort.inl @@ -0,0 +1,1103 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file stable_merge_sort.inl + * \brief Inline file for stable_merge_sort.h. + * \note This algorithm is based on the one described + * in "Designing Efficient Sorting Algorithms for + * Manycore GPUs", by Satish, Harris, and Garland. 
+ */ + +#include + +#include +#include + +#include + +#include // for log2 +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN + + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ +namespace detail +{ +namespace stable_merge_sort_detail +{ + + +template + struct is_block_size_valid +{ + // assume sm_10 limits + static const unsigned int max_num_smem_bytes = 16384; + + // CUDA steals 256 for itself for kernel parms + static const unsigned int num_reserved_smem_bytes = 256; + + // the number of bytes available to our kernels + static const unsigned int num_available_smem_bytes = max_num_smem_bytes - num_reserved_smem_bytes; + + // merge_small_tiles_by_key_closure is the hungriest kernel + // the block_size it uses is 2x the size of all the other kernels + // this merge_small_tiles_by_key_closure's smem requirements: + // 2 * block_size_x2 * sizeof(Key) + // + 2 * block_size_x2 * sizeof(Key) + // + 2 * block_size_x2 * sizeof(Value) + // ================================ + // 4 * (block_size) * (2 * sizeof(Key) + sizeof(Value)) + static const unsigned int num_needed_smem_bytes = 4 * (1 << log_block_size) * (2 * sizeof(Key) + sizeof(Value)); + + static const bool value = num_needed_smem_bytes <= num_available_smem_bytes; +}; + + + +// choose a (log) block_size to use for our kernels +template + struct select_log_block_size + : thrust::detail::eval_if< + is_block_size_valid::value, + thrust::detail::integral_constant, + select_log_block_size + >::type +{}; + + +// don't recurse lower than block_size < 128 +template + struct select_log_block_size<6, Key, Value> +{ + // no block size exists which can satisfy the storage demands +}; + + +template + struct block_size +{ + // prefer block_size == 512, go lower if we need to + static const unsigned int value = 1 << select_log_block_size<8, Key, Value>::value; +}; + + +template +inline unsigned int max_grid_size(Size block_size) +{ + const device_properties_t& properties = device_properties(); + + const unsigned int max_threads = properties.maxThreadsPerMultiProcessor * properties.multiProcessorCount; + const unsigned int max_blocks = properties.maxGridSize[0]; + + return std::min(max_blocks, 3 * max_threads / block_size); +} // end max_grid_size() + + +// Base case for the merge algorithm: merges data where tile_size <= block_size. +// Works by loading two or more tiles into shared memory and doing a binary search. 
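// Worked example of the shared-memory sizing above (is_block_size_valid /
// select_log_block_size), assuming 4-byte keys and 4-byte values:
//   available bytes           = 16384 - 256          = 16128
//   needed for block_size 512 = 4 * 512 * (2*4 + 4)  = 24576   (too large)
//   needed for block_size 256 = 4 * 256 * (2*4 + 4)  = 12288   (fits)
// so a 512-thread block (the stated preference) would not fit and the selection
// settles on block_size = 256. With 8-byte keys and values, 256 threads would
// need 4 * 256 * (2*8 + 8) = 24576 bytes, and the recursion continues down to
// block_size = 128 (4 * 128 * 24 = 12288 bytes).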
+template +struct merge_small_tiles_by_key_closure +{ + typedef Context context_type; + + RandomAccessIterator1 keys_first; + RandomAccessIterator2 values_first; + const unsigned int n; + const unsigned int log_tile_size; + RandomAccessIterator3 keys_result; + RandomAccessIterator4 values_result; + StrictWeakOrdering comp; + context_type context; + + // these members are derivable from block_size, n, and log_tile_size + unsigned int index_of_last_block; + unsigned int index_of_last_tile_in_last_block; + unsigned int size_of_last_tile; + + merge_small_tiles_by_key_closure + (RandomAccessIterator1 keys_first, + RandomAccessIterator2 values_first, + const unsigned int n, + const unsigned int log_tile_size, + RandomAccessIterator3 keys_result, + RandomAccessIterator4 values_result, + StrictWeakOrdering comp, + Context context = Context()) + : keys_first(keys_first), values_first(values_first), + n(n), + log_tile_size(log_tile_size), + keys_result(keys_result), values_result(values_result), + comp(comp), + context(context) + { + // compute the number of tiles, including a possible partial tile + unsigned int tile_size = 1 << log_tile_size; + unsigned int num_tiles = thrust::detail::util::divide_ri(n, tile_size); + unsigned int partial_tile_size = n % tile_size; + + // compute the number of logical thread blocks, including a possible partial block + unsigned int tiles_per_block = block_size / tile_size; + unsigned int num_blocks = thrust::detail::util::divide_ri(num_tiles, tiles_per_block); + unsigned int partial_block_size = num_tiles % tiles_per_block; + + // compute the number of tiles in the last block, which might be of partial size + unsigned int number_of_tiles_in_last_block = partial_block_size ? partial_block_size : tiles_per_block; + + size_of_last_tile = partial_tile_size ? partial_tile_size : tile_size; + index_of_last_tile_in_last_block = number_of_tiles_in_last_block - 1; + index_of_last_block = num_blocks - 1; + } + + unsigned int grid_size() const + { + const unsigned int max_num_blocks = max_grid_size(block_size); + const unsigned int num_logical_blocks = index_of_last_block + 1; + return thrust::min(num_logical_blocks, max_num_blocks); + } + + __device__ __thrust_forceinline__ + void operator()(void) + { + typedef typename iterator_value::type KeyType; + typedef typename iterator_value::type ValueType; + + // load (2*block_size) elements into shared memory. These (2*block_size) elements belong to (2*block_size)/tile_size different tiles. 
+ __shared__ uninitialized_array key; + __shared__ uninitialized_array outkey; + __shared__ uninitialized_array outvalue; + + const unsigned int grid_size = context.grid_dimension() * context.block_dimension(); + + unsigned int block_idx = context.block_index(); + + // the global index of this task + unsigned int i = context.thread_index() + context.block_index() * context.block_dimension(); + + // advance iterators + keys_first += i; + values_first += i; + keys_result += i; + values_result += i; + + for(; + block_idx <= index_of_last_block; + block_idx += context.grid_dimension(), i += grid_size, keys_first += grid_size, values_first += grid_size, keys_result += grid_size, values_result += grid_size) + { + KeyType my_key; + + // copy over inputs to shared memory + if(i < n) + { + key[context.thread_index()] = my_key = *keys_first; + } // end if + + // the tile to which the element belongs + unsigned int tile_index = context.thread_index()>>log_tile_size; + + // figure out the index and size of the other tile + unsigned int other_tile_index = tile_index^1; + unsigned int other_tile_size = (1< + class static_strided_integer_range +{ + // XXX cudafe doesn't like this private for some reason + //private: + public: + typedef typename thrust::counting_iterator counting_iterator; + + struct stride_functor + : public thrust::unary_function + { + inline __host__ __device__ + unsigned int operator()(unsigned int i) const + { + return stride * i; + } + }; + + public: + typedef typename thrust::transform_iterator iterator; + + static_strided_integer_range(unsigned int num_strides) + : m_begin(iterator(counting_iterator(0), stride_functor())), + m_end(iterator(counting_iterator(num_strides), stride_functor())) + {} + + iterator begin() const + { + return m_begin; + } + + iterator end() const + { + return m_end; + } + + private: + iterator m_begin, m_end; +}; + + +///////////////////// Find the rank of each extracted element in both arrays //////////////////////////////////////// +///////////////////// This breaks up the array into independent segments to merge //////////////////////////////////////// +// Inputs: d_splitters, d_splittes_pos: the merged array of splitters with corresponding positions. +// d_srcData: input data, datasize: number of entries in d_srcData +// N_SPLITTERS the number of splitters, log_blocksize: log of the size of each block of sorted data +// log_num_merged_splitters_per_tile = log of the number of merged splitters. ( = log_blocksize - 7). +// Output: d_rank1, d_rank2: ranks of each splitter in d_splitters in the block to which it belongs +// (say i) and its corresponding block (block i+1). 
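// A small numeric illustration of the ranks computed below (hypothetical data,
// block size 4, one splitter sampled at the start of every sorted block):
//   tile i   : [1 4 7 9 | 12 13 20 22]   splitters: 1, 12
//   tile i+1 : [2 3 5 8 | 10 15 18 30]   splitters: 2, 10
// Take the splitter 10 from tile i+1. Its rank inside its own tile is simply
// its offset within that tile (4). Its rank inside tile i is found by a binary
// search restricted to the single block of tile i it can land in (the block
// [1 4 7 9], since tile i's next splitter, 12, already exceeds 10), giving 4.
// The resulting (rank1, rank2) pairs cut both tiles into small independent
// segments that the final kernel can merge entirely in shared memory.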
+template +struct rank_splitters_closure +{ + typedef Context context_type; + + static const unsigned int log_block_size = thrust::detail::mpl::math::log2::value; + + RandomAccessIterator1 splitters_first; + RandomAccessIterator2 splitters_pos_first; + RandomAccessIterator3 keys_first; + RandomAccessIterator4 ranks_result1; + RandomAccessIterator4 ranks_result2; + unsigned int num_splitters; + unsigned int num_keys; + unsigned int log_tile_size; + thrust::detail::device_function< + StrictWeakOrdering, + bool + > comp; + context_type context; + + // this member is derivable from those received in the constructor + unsigned int log_num_merged_splitters_per_tile; + + rank_splitters_closure(RandomAccessIterator1 splitters_first, + RandomAccessIterator2 splitters_pos_first, + RandomAccessIterator3 keys_first, + unsigned int num_splitters, + unsigned int num_keys, + unsigned int log_tile_size, + RandomAccessIterator4 ranks_result1, + RandomAccessIterator4 ranks_result2, + StrictWeakOrdering comp, + context_type context = context_type()) + : splitters_first(splitters_first), splitters_pos_first(splitters_pos_first), + keys_first(keys_first), + ranks_result1(ranks_result1), ranks_result2(ranks_result2), + num_splitters(num_splitters), num_keys(num_keys), + log_tile_size(log_tile_size), + comp(comp), context(context) + { + // the number of splitters in each tile before merging + const unsigned int log_num_splitters_per_tile = log_tile_size - log_block_size; + + // the number of splitters in each merged tile + log_num_merged_splitters_per_tile = log_num_splitters_per_tile + 1; + } + + inline unsigned int grid_size() const + { + unsigned int num_blocks = num_splitters / block_size; + if(num_splitters % block_size) ++num_blocks; + + // compute the maximum number of block_size we can launch on this arch + const unsigned int max_num_blocks = max_grid_size(block_size); + + return min(num_blocks, max_num_blocks); + } + + /*! this member function returns the index of the (odd,even) block pair + * that the splitter of interest belongs to + * \param splitter_idx The index of the splitter in the splitters list + * \return The global index of the (odd,even) block pair + */ + __device__ __thrust_forceinline__ + unsigned int block_pair_idx(unsigned int splitter_idx) const + { + return splitter_idx >> log_num_merged_splitters_per_tile; + } + + /*! This member function returns the end of the search range in the other tile in + * which the splitter of interest needs to be ranked. + * \param splitter_idx The index of the splitter in the splitters array + * \param splitter_global_idx The index of the splitter in the global array of elements + * \param tile_idx The index of the tile to which the splitter belongs. + * \return The half-open interval in the other tile in which the splitter needs to be ranked. + * [first_index_to_search, size_of_interval) + */ + __device__ __thrust_forceinline__ + thrust::pair search_interval(unsigned int splitter_idx, unsigned int splitter_global_idx, unsigned int tile_idx) const + { + // We want to compute the ranks of the splitter in d_srcData1 and d_srcData2 + // for instance, if the splitter belongs to d_srcData1, then + // (1) the rank in d_srcData1 is simply given by its splitter_global_idx + // (2) to find the rank in d_srcData2, we first find the block in d_srcData2 where inp appears. 
+ // We do this by noting that we have already merged/sorted splitters, and thus the rank + // of inp in the elements of d_srcData2 that are present in splitters is given by + // position of inp in d_splitters - rank of inp in elements of d_srcData1 in splitters + // = i - splitter_global_idx + // This also gives us the block of d_srcData2 that the splitter belongs in, since we have one + // element in splitters per block of d_srcData2. + + // We now perform a binary search over this block of d_srcData2 to find the rank of inp in d_srcData2. + // start and end are the start and end indices of this block in d_srcData2, forming the bounds of the binary search. + // Note that this binary search is in global memory with uncoalesced loads. However, we only find the ranks + // of a small set of elements, one per splitter: thus it is not the performance bottleneck. + + // the local index of the splitter within the (odd, even) block pair. + const unsigned int splitter_block_pair_idx = splitter_idx - (block_pair_idx(splitter_idx)<> log_block_size; + + // find the end of the search range in the other tile + unsigned int end = (( splitter_block_pair_idx - block_tile_idx) << log_block_size); + + // begin by assuming the search range is the size of a full block + unsigned int other_block_size = block_size; + + // the index of the other tile can be found with + const unsigned int other_tile_idx = tile_idx ^ 1; + + // the size of the other tile can be less than tile_size if the it is the last tile. + unsigned int other_tile_size = min(1 << log_tile_size, num_keys - (other_tile_idx< other_tile_size) + { + // the other block has partial size + end = other_tile_size; + other_block_size = num_keys % block_size; + } + else if(end == 0) + { + // when the search range is empty + // the other_block_size is 0 + other_block_size = 0; + } + + // the search range begins other_block_size elements before the end + unsigned int start = end - other_block_size; + + return thrust::make_pair(start,other_block_size); + } + + __device__ __thrust_forceinline__ + void operator()(void) + { + typedef typename iterator_value::type KeyType; + typedef typename iterator_value::type IndexType; + + const unsigned int grid_size = context.grid_dimension() * context.block_dimension(); + + unsigned int splitter_idx = context.thread_index() + context.block_index() * context.block_dimension(); + + // advance iterators + splitters_first += splitter_idx; + splitters_pos_first += splitter_idx; + ranks_result1 += splitter_idx; + ranks_result2 += splitter_idx; + + for(; + splitter_idx < num_splitters; + splitter_idx += grid_size, splitters_first += grid_size, splitters_pos_first += grid_size, ranks_result1 += grid_size, ranks_result2 += grid_size) + { + // the index of the splitter within the global array of elements + IndexType splitter_global_idx = *splitters_pos_first; + + // the tile to which the splitter belongs. + unsigned int tile_idx = (splitter_global_idx >> log_tile_size); + + // the index of the "other" tile which which tile_idx must be merged. 
+ unsigned int other_tile_idx = tile_idx^1; + + // compute the interval in the other tile to search + unsigned int start, n; + thrust::tie(start,n) = search_interval(splitter_idx, splitter_global_idx, tile_idx); + + // point to the beginning of the other tile + RandomAccessIterator3 other_tile_begin = keys_first + (other_tile_idx< + void rank_splitters(RandomAccessIterator1 splitters_first, + RandomAccessIterator1 splitters_last, + RandomAccessIterator2 splitter_positions_first, + RandomAccessIterator3 keys_first, + RandomAccessIterator3 keys_last, + size_t log_tile_size, + RandomAccessIterator4 ranks_result1, + RandomAccessIterator4 ranks_result2, + StrictWeakOrdering comp) +{ + typedef rank_splitters_closure< + block_size, + RandomAccessIterator1, + RandomAccessIterator2, + RandomAccessIterator3, + RandomAccessIterator4, + StrictWeakOrdering, + detail::statically_blocked_thread_array + > Closure; + + Closure closure(splitters_first, + splitter_positions_first, + keys_first, + splitters_last - splitters_first, + keys_last - keys_first, + log_tile_size, + ranks_result1, + ranks_result2, + comp); + + detail::launch_closure(closure, closure.grid_size(), block_size); +} + + +template +__device__ + void copy_n(Context context, + RandomAccessIterator1 first1, + RandomAccessIterator2 first2, + Size n, + RandomAccessIterator3 result1, + RandomAccessIterator4 result2) +{ + for(Size i = context.thread_index(); + i < n; + i += context.block_dimension()) + { + result1[i] = first1[i]; + result2[i] = first2[i]; + } +} + + +///////////////////// MERGE TWO INDEPENDENT SEGMENTS USING BINARY SEARCH IN SHARED MEMORY //////////////////////////////////////// +// NOTE: This is the most compute-intensive part of the algorithm. +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Thread block i merges entries between rank[i] and rank[i+1]. These can be independently +// merged and concatenated, as noted above. +// Each thread in the thread block i does a binary search of one element between rank[i] -> rank[i+1] in the +// other array. + +// Inputs: srcdatakey, value: inputs +// log_blocksize, log_num_merged_splitters_per_tile: as in previous functions +// Outputs: resultdatakey, resultdatavalue: output merged arrays are written here. 
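Before the closure that follows, the partitioning idea may be easier to see in ordinary sequential C++. This is only an illustrative model, not the device implementation: given the ranks of the merged splitters in the even and odd tiles (here rank1 and rank2, each carrying a final sentinel entry equal to its tile's size), sub-tile i of each tile can be merged independently and written at output offset rank1[i] + rank2[i]. The names merge_subtiles_model, even_tile and odd_tile are invented, and result is assumed to be pre-sized to hold both tiles.

#include <algorithm>
#include <cstddef>
#include <vector>

template <typename T, typename Compare>
void merge_subtiles_model(const std::vector<T>& even_tile,
                          const std::vector<T>& odd_tile,
                          const std::vector<std::size_t>& rank1,  // m+1 entries, rank1[m] == even_tile.size()
                          const std::vector<std::size_t>& rank2,  // m+1 entries, rank2[m] == odd_tile.size()
                          std::vector<T>& result,                 // pre-sized to even_tile.size() + odd_tile.size()
                          Compare comp)
{
    for (std::size_t i = 0; i + 1 < rank1.size(); ++i)
    {
        // each sub-tile merge touches disjoint input and output ranges,
        // which is what lets the kernel assign one thread block per splitter
        std::merge(even_tile.begin() + rank1[i], even_tile.begin() + rank1[i + 1],
                   odd_tile.begin()  + rank2[i], odd_tile.begin()  + rank2[i + 1],
                   result.begin() + rank1[i] + rank2[i],
                   comp);
    }
}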
+template +struct merge_subtiles_by_key_closure +{ + typedef Context context_type; + static const unsigned int log_block_size = thrust::detail::mpl::math::log2::value; + + RandomAccessIterator1 keys_first; + RandomAccessIterator2 values_first; + unsigned int n; + RandomAccessIterator3 ranks_first1; + RandomAccessIterator4 ranks_first2; + const unsigned int tile_size; + const unsigned int num_splitters; + RandomAccessIterator5 keys_result; + RandomAccessIterator6 values_result; + StrictWeakOrdering comp; + Context context; + + // this member is derivable from the constructor parameters + unsigned int log_num_merged_splitters_per_tile; + + merge_subtiles_by_key_closure + (RandomAccessIterator1 keys_first, + RandomAccessIterator2 values_first, + unsigned int n, + RandomAccessIterator3 ranks_first1, + RandomAccessIterator4 ranks_first2, + const unsigned int log_tile_size, + const unsigned int num_splitters, + RandomAccessIterator5 keys_result, + RandomAccessIterator6 values_result, + StrictWeakOrdering comp, + Context context = Context()) + : keys_first(keys_first), values_first(values_first), n(n), + ranks_first1(ranks_first1), ranks_first2(ranks_first2), + tile_size(1 << log_tile_size), + num_splitters(num_splitters), + keys_result(keys_result), values_result(values_result), + comp(comp), context(context) + { + const unsigned int log_num_splitters_per_tile = log_tile_size - log_block_size; + log_num_merged_splitters_per_tile = log_num_splitters_per_tile + 1; + } + + unsigned int grid_size() const + { + const unsigned int max_num_blocks = max_grid_size(block_size); + return thrust::min(num_splitters, max_num_blocks); + } + + __device__ __thrust_forceinline__ + unsigned int even_offset(unsigned int oddeven_blockid) const + { + return oddeven_blockid << (log_num_merged_splitters_per_tile + log_block_size); + } + + __device__ __thrust_forceinline__ + void get_partition(unsigned int partition_idx, unsigned int oddeven_blockid, + unsigned int &rank1, unsigned int &size1, + unsigned int &rank2, unsigned int &size2) const + { + // XXX this logic would be much improved if we were guaranteed that there was + // an element at ranks_first[1] + // XXX we could eliminate the need for local_blockIdx, log_num_merged_splitters_per_block, tile_size, and n + + // the index of the merged splitter within the splitters for the odd-even block pair. + unsigned int local_blockIdx = partition_idx - (oddeven_blockid< n) + { + size2 = n - tile_size - even_offset(oddeven_blockid); + } // end if + + // measure each array relative to its beginning + size1 -= rank1; + size2 -= rank2; + } + + template + __device__ __thrust_forceinline__ + void do_it(KeyType *s_keys, ValueType *s_values) + { + // advance iterators + unsigned int i = context.block_index(); + ranks_first1 += i; + ranks_first2 += i; + + // Thread Block i merges the sub-block associated with splitter i: rank[i] -> rank[i+1] in a particular odd-even block pair. + for(; + i < num_splitters; + i += context.grid_dimension(), ranks_first1 += context.grid_dimension(), ranks_first2 += context.grid_dimension()) + { + // the (odd, even) block pair that the splitter belongs to. + unsigned int oddeven_blockid = i >> log_num_merged_splitters_per_tile; + + // start1 & start2 store rank[i] and rank[i+1] indices in arrays 1 and 2. + // size1 & size2 store the number of of elements between rank[i] & rank[i+1] in arrays 1 & 2. 
+ unsigned int rank1, rank2, size1, size2; + get_partition(i, oddeven_blockid, rank1, size1, rank2, size2); + + // find where the odd,even arrays begin + RandomAccessIterator1 even_keys_first = keys_first + even_offset(oddeven_blockid); + RandomAccessIterator1 odd_keys_first = even_keys_first + tile_size; + + RandomAccessIterator2 even_values_first = values_first + even_offset(oddeven_blockid); + RandomAccessIterator2 odd_values_first = even_values_first + tile_size; + + // load tiles into smem + copy_n(context, even_keys_first + rank1, even_values_first + rank1, size1, s_keys, s_values); + copy_n(context, odd_keys_first + rank2, odd_values_first + rank2, size2, s_keys + size1, s_values + size1); + + context.barrier(); + + // merge the arrays in-place + block::inplace_merge_by_key_n(context, s_keys, s_values, size1, size2, comp); + + context.barrier(); + + // write tiles to gmem + unsigned int dst_offset = even_offset(oddeven_blockid) + rank1 + rank2; + copy_n(context, s_keys, s_values, size1 + size2, keys_result + dst_offset, values_result + dst_offset); + + context.barrier(); + } // end for i + } + + __device__ __thrust_forceinline__ + void operator()(void) + { + typedef typename iterator_value::type KeyType; + typedef typename iterator_value::type ValueType; + + __shared__ uninitialized_array s_keys; + __shared__ uninitialized_array s_values; + + do_it(s_keys.data(), s_values.data()); + } +}; // merge_subtiles_by_key_closure + +// merge_subtiles_by_key() merges each sub-tile independently. As explained in rank_splitters(), +// the sub-tiles are defined by the ranks of the splitter elements d_rank1 and d_rank2 in the odd and even tiles resp. +// It can be easily shown that each sub-tile cannot contain more than block_size elements of either the odd or even tile. + +// the function calls merge_subblocks_binarysearch_kernel() for the remaining N_splitterS sub-tiles +// We use 1 thread block per splitter: For instance, thread block 0 will merge rank1[0] -> rank1[1] of array i with +// rank2[0] -> rank2[1] of array i^1, with i being the thread block to which the splitter belongs. + +// We implement each sub-tile merge using a binary search. We compute the rank of each element belonging to a sub-tile +// of an odd numbered tile in the corresponding sub-tile of its even numbered pair. It then adds this rank to +// the index of the element in its own sub-tile to find the output index of the element in the merged sub-tile. 
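The binary-search merge described above can be sketched in plain C++ as follows. This is an illustrative model only, assuming two sorted ranges small enough to walk sequentially; the names rank_merge, a, b and out are invented. Each element's output position is its index within its own range plus its rank in the other range, and using lower_bound for the first range but upper_bound for the second keeps equal keys in their original order, mirroring the stability the surrounding sort requires.

#include <algorithm>
#include <cstddef>

template <typename T, typename Compare>
void rank_merge(const T* a, std::size_t na,
                const T* b, std::size_t nb,
                T* out, Compare comp)
{
    // elements of the first range go before equal elements of the second
    for (std::size_t i = 0; i < na; ++i)   // on the GPU: one thread per element
    {
        std::size_t rank = static_cast<std::size_t>(
            std::lower_bound(b, b + nb, a[i], comp) - b);
        out[i + rank] = a[i];
    }
    for (std::size_t j = 0; j < nb; ++j)
    {
        std::size_t rank = static_cast<std::size_t>(
            std::upper_bound(a, a + na, b[j], comp) - a);
        out[j + rank] = b[j];
    }
}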
+ +template + void merge_subtiles_by_key(RandomAccessIterator1 keys_first, + RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first, + RandomAccessIterator3 splitters_pos_first, + RandomAccessIterator3 splitters_pos_last, + RandomAccessIterator4 ranks_first1, + RandomAccessIterator5 ranks_first2, + RandomAccessIterator6 keys_result, + RandomAccessIterator7 values_result, + unsigned int log_tile_size, + StrictWeakOrdering comp) +{ + typedef typename iterator_value::type KeyType; + typedef typename iterator_value::type ValueType; + + const unsigned int block_size = stable_merge_sort_detail::block_size::value; + + typedef merge_subtiles_by_key_closure< + block_size, + RandomAccessIterator1, + RandomAccessIterator2, + RandomAccessIterator4, + RandomAccessIterator5, + RandomAccessIterator6, + RandomAccessIterator7, + StrictWeakOrdering, + detail::statically_blocked_thread_array + > Closure; + + Closure closure(keys_first, + values_first, + keys_last - keys_first, + ranks_first1, + ranks_first2, + log_tile_size, + splitters_pos_last - splitters_pos_first, + keys_result, + values_result, + comp); + + detail::launch_closure(closure, closure.grid_size(), block_size); +} + + +template + void merge_small_tiles_by_key(execution_policy &, + RandomAccessIterator1 keys_first, + RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first, + size_t log_tile_size, + RandomAccessIterator3 keys_result, + RandomAccessIterator4 values_result, + StrictWeakOrdering comp) +{ + typedef merge_small_tiles_by_key_closure< + block_size, + RandomAccessIterator1, + RandomAccessIterator2, + RandomAccessIterator3, + RandomAccessIterator4, + StrictWeakOrdering, + detail::statically_blocked_thread_array + > Closure; + + Closure closure(keys_first, values_first, keys_last - keys_first, log_tile_size, keys_result, values_result, comp); + + detail::launch_closure(closure, closure.grid_size(), block_size); +} // end merge_small_tiles_by_key() + + +template + void merge_tiles_by_key_recursive(execution_policy &exec, + RandomAccessIterator1 keys_first, + RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first, + RandomAccessIterator3 keys_result, + RandomAccessIterator4 values_result, + size_t log_tile_size, + StrictWeakOrdering comp) +{ + typedef typename iterator_value::type KeyType; + typedef typename iterator_value::type ValueType; + + const size_t tile_size = 1<::value; + + // Case (a): tile_size <= block_size + if(tile_size <= block_size) + { + return merge_small_tiles_by_key<2*block_size>(exec, keys_first, keys_last, values_first, log_tile_size, keys_result, values_result, comp); + } // end if + + // Case (b) tile_size >= block_size + + // step 1 of the recursive case: gather one splitter per block_size entries in each odd-even tile pair. 
+ thrust::detail::temporary_array splitters(exec, thrust::detail::util::divide_ri(keys_last - keys_first, block_size)); + static_strided_integer_range splitters_pos(splitters.size()); + thrust::gather(exec, splitters_pos.begin(), splitters_pos.end(), keys_first, splitters.begin()); + + // step 2 of the recursive case: merge the splitters & their positions + thrust::detail::temporary_array merged_splitters(exec, splitters.size()); + thrust::detail::temporary_array merged_splitters_pos(exec, splitters.size()); + + const unsigned int log_block_size = thrust::detail::mpl::math::log2::value; + size_t log_num_splitters_per_tile = log_tile_size - log_block_size; + merge_tiles_by_key_recursive(exec, + splitters.begin(), + splitters.end(), + splitters_pos.begin(), + merged_splitters.begin(), + merged_splitters_pos.begin(), + log_num_splitters_per_tile, + comp); + + // step 3 of the recursive case: find the ranks of each splitter in the respective two tiles. + // reuse the merged_splitters_pos storage + thrust::detail::temporary_array &rank1 = merged_splitters_pos; + thrust::detail::temporary_array rank2(exec, rank1.size()); + + rank_splitters(merged_splitters.begin(), + merged_splitters.end(), + merged_splitters_pos.begin(), + keys_first, + keys_last, + log_tile_size, + rank1.begin(), + rank2.begin(), + comp); + + // step 4 of the recursive case: merge each sub-tile independently in parallel. + merge_subtiles_by_key(keys_first, + keys_last, + values_first, + merged_splitters_pos.begin(), + merged_splitters_pos.end(), + rank1.begin(), + rank2.begin(), + keys_result, + values_result, + log_tile_size, + comp); +} + + +template + void merge_tiles_by_key(execution_policy &exec, + RandomAccessIterator1 keys_first, + RandomAccessIterator2 values_first, + size_t n, + RandomAccessIterator3 keys_result, + RandomAccessIterator4 values_result, + unsigned int log_tile_size, + StrictWeakOrdering comp) +{ + const unsigned int tile_size = 1 << log_tile_size; + const size_t num_tiles = thrust::detail::util::divide_ri(n, tile_size); + + // if there is an odd number of tiles, we should exclude the last one + // without a twin in merge_recursive + const size_t last_tile_offset = (num_tiles%2)?((num_tiles-1)*tile_size):n; + + merge_tiles_by_key_recursive(exec, + keys_first, + keys_first + last_tile_offset, + values_first, + keys_result, + values_result, + log_tile_size, + comp); + + // copy the last tile without a twin, should it exist + if(last_tile_offset < n) + { + thrust::copy(exec, keys_first + last_tile_offset, keys_first + n, keys_result + last_tile_offset); + thrust::copy(exec, values_first + last_tile_offset, values_first + n, values_result + last_tile_offset); + } // end if +} // end merge_tiles_by_key() + + +} // end stable_merge_sort_detail + + + +template +void stable_merge_sort(execution_policy &exec, + RandomAccessIterator first, + RandomAccessIterator last, + StrictWeakOrdering comp) +{ + // XXX it's potentially unsafe to pass the same array for keys & values + thrust::system::cuda::detail::detail::stable_merge_sort_by_key(exec, first, last, first, comp); +} + + +template + void stable_merge_sort_by_key(execution_policy &exec, + RandomAccessIterator1 keys_first, + RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first, + StrictWeakOrdering comp) +{ + typedef typename thrust::iterator_traits::value_type KeyType; + typedef typename thrust::iterator_traits::value_type ValueType; + + // compute the block_size based on the types we're sorting + const unsigned int block_size = 
stable_merge_sort_detail::block_size::value; + + // XXX WAR unused variable warning issued by nvcc + (void) block_size; + + // first, sort each tile of block_size elements + stable_sort_by_count(exec, keys_first, keys_last, values_first, comp); + + // merge tiles if there is more than one + const size_t n = keys_last - keys_first; + if(n > block_size) + { + // allocate scratch space + using namespace thrust::detail; + using namespace stable_merge_sort_detail; + temporary_array temp_keys(exec, n); + temporary_array temp_values(exec, n); + + // use a caching allocator for the calls to merge_tiles_by_key + // XXX unfortunately g++-4.2 can't deal with this special execution policy +#if defined(THRUST_GCC_VERSION) && THRUST_GCC_VERSION < 40300 + execution_policy &merge_exec = exec; +#else + cached_temporary_allocator merge_exec(exec); +#endif + + // The log(n) iterations start here. Each call to 'merge' merges an odd-even pair of tiles + unsigned int log_tile_size = thrust::detail::mpl::math::log2::value; + bool ping = true; + for(; (1u << log_tile_size) < n; ++log_tile_size, ping = !ping) + { + // we ping-pong back and forth + if(ping) + { + merge_tiles_by_key(merge_exec, keys_first, values_first, n, temp_keys.begin(), temp_values.begin(), log_tile_size, comp); + } // end if + else + { + merge_tiles_by_key(merge_exec, temp_keys.begin(), temp_values.begin(), n, keys_first, values_first, log_tile_size, comp); + } // end else + } // end for + + // this is to make sure that our data is finally in the data and keys arrays + // and not in the temporary arrays + if(!ping) + { + thrust::copy(exec, temp_keys.begin(), temp_keys.end(), keys_first); + thrust::copy(exec, temp_values.begin(), temp_values.end(), values_first); + } // end if + } // end if +} // end stable_merge_sort_by_key() + + +} // end namespace detail +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + +__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END + diff --git a/compat/thrust/system/cuda/detail/detail/stable_primitive_sort.h b/compat/thrust/system/cuda/detail/detail/stable_primitive_sort.h new file mode 100644 index 0000000..8449a17 --- /dev/null +++ b/compat/thrust/system/cuda/detail/detail/stable_primitive_sort.h @@ -0,0 +1,54 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ +namespace detail +{ + +template +void stable_primitive_sort(execution_policy &exec, + RandomAccessIterator first, + RandomAccessIterator last); + +template +void stable_primitive_sort_by_key(execution_policy &exec, + RandomAccessIterator1 keys_first, + RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first); + +} // end namespace detail +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/cuda/detail/detail/stable_primitive_sort.inl b/compat/thrust/system/cuda/detail/detail/stable_primitive_sort.inl new file mode 100644 index 0000000..d6f4c77 --- /dev/null +++ b/compat/thrust/system/cuda/detail/detail/stable_primitive_sort.inl @@ -0,0 +1,159 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ +namespace detail +{ +namespace stable_primitive_sort_detail +{ + + +template + struct enable_if_bool_sort + : thrust::detail::enable_if< + thrust::detail::is_same< + bool, + typename thrust::iterator_value::type + >::value + > +{}; + + +template + struct disable_if_bool_sort + : thrust::detail::disable_if< + thrust::detail::is_same< + bool, + typename thrust::iterator_value::type + >::value + > +{}; + + +template + typename enable_if_bool_sort::type + stable_primitive_sort(execution_policy &exec, + RandomAccessIterator first, + RandomAccessIterator last) +{ + // use stable_partition if we're sorting bool + // stable_partition puts true values first, so we need to logical_not + thrust::stable_partition(exec, first, last, thrust::logical_not()); +} + + +template + typename disable_if_bool_sort::type + stable_primitive_sort(execution_policy &exec, + RandomAccessIterator first, + RandomAccessIterator last) +{ + // call stable_radix_sort + thrust::system::cuda::detail::detail::stable_radix_sort(exec,first,last); +} + + +struct logical_not_first +{ + template + __host__ __device__ + bool operator()(Tuple t) + { + return !thrust::get<0>(t); + } +}; + + +template + typename enable_if_bool_sort::type + stable_primitive_sort_by_key(execution_policy &exec, + RandomAccessIterator1 keys_first, + RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first) +{ + // use stable_partition if we're sorting bool + // stable_partition puts true values first, so we need to logical_not + thrust::stable_partition(exec, + thrust::make_zip_iterator(thrust::make_tuple(keys_first, values_first)), + thrust::make_zip_iterator(thrust::make_tuple(keys_last, values_first)), + logical_not_first()); +} + + +template + typename disable_if_bool_sort::type + stable_primitive_sort_by_key(execution_policy &exec, + RandomAccessIterator1 keys_first, + 
RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first) +{ + // call stable_radix_sort_by_key + thrust::system::cuda::detail::detail::stable_radix_sort_by_key(exec, keys_first, keys_last, values_first); +} + + + +} + +template +void stable_primitive_sort(execution_policy &exec, + RandomAccessIterator first, + RandomAccessIterator last) +{ + thrust::system::cuda::detail::detail::stable_primitive_sort_detail::stable_primitive_sort(exec,first,last); +} + +template +void stable_primitive_sort_by_key(execution_policy &exec, + RandomAccessIterator1 keys_first, + RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first) +{ + thrust::system::cuda::detail::detail::stable_primitive_sort_detail::stable_primitive_sort_by_key(exec, keys_first, keys_last, values_first); +} + +} // end namespace detail +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cuda/detail/detail/stable_radix_sort.h b/compat/thrust/system/cuda/detail/detail/stable_radix_sort.h new file mode 100644 index 0000000..7a8b996 --- /dev/null +++ b/compat/thrust/system/cuda/detail/detail/stable_radix_sort.h @@ -0,0 +1,59 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file stable_radix_sort_dev.h + * \brief Defines the interface for a stable radix sort implementation on CUDA + */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ +namespace detail +{ + +template +void stable_radix_sort(execution_policy &exec, + RandomAccessIterator first, + RandomAccessIterator last); + +template +void stable_radix_sort_by_key(execution_policy &exec, + RandomAccessIterator1 keys_first, + RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first); + +} // end namespace detail +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/cuda/detail/detail/stable_radix_sort.inl b/compat/thrust/system/cuda/detail/detail/stable_radix_sort.inl new file mode 100644 index 0000000..9ea1977 --- /dev/null +++ b/compat/thrust/system/cuda/detail/detail/stable_radix_sort.inl @@ -0,0 +1,220 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +// do not attempt to compile this file with any other compiler +#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC + +#include +#include +#include +#include + +#include +#include +#include +#include + + +__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN + + +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ +namespace detail +{ + +template +void stable_radix_sort(execution_policy &exec, + RandomAccessIterator first, + RandomAccessIterator last) +{ + typedef typename thrust::iterator_value::type K; + + unsigned int num_elements = last - first; + + // ensure data is properly aligned + if (!thrust::detail::util::is_aligned(thrust::raw_pointer_cast(&*first), 2*sizeof(K))) + { + thrust::detail::temporary_array aligned_keys(exec, first, last); + stable_radix_sort(exec, aligned_keys.begin(), aligned_keys.end()); + thrust::copy(exec, aligned_keys.begin(), aligned_keys.end(), first); + return; + } + + thrust::system::cuda::detail::detail::b40c_thrust::RadixSortingEnactor sorter(num_elements); + thrust::system::cuda::detail::detail::b40c_thrust::RadixSortStorage storage; + + // allocate temporary buffers + thrust::detail::temporary_array temp_keys(exec, num_elements); + thrust::detail::temporary_array temp_spine(exec, sorter.SpineElements()); + thrust::detail::temporary_array temp_from_alt(exec, 2); + + // define storage + storage.d_keys = thrust::raw_pointer_cast(&*first); + storage.d_alt_keys = thrust::raw_pointer_cast(&temp_keys[0]); + storage.d_spine = thrust::raw_pointer_cast(&temp_spine[0]); + storage.d_from_alt_storage = thrust::raw_pointer_cast(&temp_from_alt[0]); + + // perform the sort + sorter.EnactSort(storage); + + // radix sort sometimes leaves results in the alternate buffers + if (storage.using_alternate_storage) + { + thrust::copy(exec, temp_keys.begin(), temp_keys.end(), first); + } +} + +/////////////////////// +// Key-Value Sorting // +/////////////////////// + +// sort values directly +template +void stable_radix_sort_by_key(execution_policy &exec, + RandomAccessIterator1 first1, + RandomAccessIterator1 last1, + RandomAccessIterator2 first2, + thrust::detail::true_type) +{ + typedef typename thrust::iterator_value::type K; + typedef typename thrust::iterator_value::type V; + + unsigned int num_elements = last1 - first1; + + // ensure data is properly aligned + if (!thrust::detail::util::is_aligned(thrust::raw_pointer_cast(&*first1), 2*sizeof(K))) + { + thrust::detail::temporary_array aligned_keys(exec, first1, last1); + stable_radix_sort_by_key(exec, aligned_keys.begin(), aligned_keys.end(), first2); + thrust::copy(exec, aligned_keys.begin(), aligned_keys.end(), first1); + return; + } + if (!thrust::detail::util::is_aligned(thrust::raw_pointer_cast(&*first2), 2*sizeof(V))) + { + thrust::detail::temporary_array aligned_values(exec, first2, first2 + num_elements); + stable_radix_sort_by_key(exec, first1, last1, aligned_values.begin()); + thrust::copy(exec, aligned_values.begin(), aligned_values.end(), first2); + return; + } + + thrust::system::cuda::detail::detail::b40c_thrust::RadixSortingEnactor sorter(num_elements); + thrust::system::cuda::detail::detail::b40c_thrust::RadixSortStorage storage; + + // allocate temporary buffers + thrust::detail::temporary_array temp_keys(exec, num_elements); + thrust::detail::temporary_array temp_values(exec, num_elements); + thrust::detail::temporary_array temp_spine(exec, sorter.SpineElements()); + thrust::detail::temporary_array temp_from_alt(exec, 2); + + // define 
storage + storage.d_keys = thrust::raw_pointer_cast(&*first1); + storage.d_values = thrust::raw_pointer_cast(&*first2); + storage.d_alt_keys = thrust::raw_pointer_cast(&temp_keys[0]); + storage.d_alt_values = thrust::raw_pointer_cast(&temp_values[0]); + storage.d_spine = thrust::raw_pointer_cast(&temp_spine[0]); + storage.d_from_alt_storage = thrust::raw_pointer_cast(&temp_from_alt[0]); + + // perform the sort + sorter.EnactSort(storage); + + // radix sort sometimes leaves results in the alternate buffers + if (storage.using_alternate_storage) + { + thrust::copy(exec, temp_keys.begin(), temp_keys.end(), first1); + thrust::copy(exec, temp_values.begin(), temp_values.end(), first2); + } +} + + +// sort values indirectly +template +void stable_radix_sort_by_key(execution_policy &exec, + RandomAccessIterator1 first1, + RandomAccessIterator1 last1, + RandomAccessIterator2 first2, + thrust::detail::false_type) +{ + typedef typename thrust::iterator_value::type V; + + unsigned int num_elements = last1 - first1; + + // sort with integer values and then permute the real values accordingly + thrust::detail::temporary_array permutation(exec, num_elements); + thrust::sequence(exec, permutation.begin(), permutation.end()); + + stable_radix_sort_by_key(exec, first1, last1, permutation.begin()); + + // copy values into temp vector and then permute + thrust::detail::temporary_array temp_values(exec, first2, first2 + num_elements); + + // permute values + thrust::gather(exec, + permutation.begin(), permutation.end(), + temp_values.begin(), + first2); +} + + +template +void stable_radix_sort_by_key(execution_policy &exec, + RandomAccessIterator1 first1, + RandomAccessIterator1 last1, + RandomAccessIterator2 first2) +{ + typedef typename thrust::iterator_value::type V; + + // decide how to handle values + static const bool sort_values_directly = thrust::detail::is_trivial_iterator::value && + thrust::detail::is_arithmetic::value && + sizeof(V) <= 8; // TODO profile this + + // XXX WAR unused variable warning + (void) sort_values_directly; + + stable_radix_sort_by_key(exec, first1, last1, first2, + thrust::detail::integral_constant()); +} + +} // end namespace detail +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + + +__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END + + +#endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC + diff --git a/compat/thrust/system/cuda/detail/detail/stable_sort_by_count.h b/compat/thrust/system/cuda/detail/detail/stable_sort_by_count.h new file mode 100644 index 0000000..b563654 --- /dev/null +++ b/compat/thrust/system/cuda/detail/detail/stable_sort_by_count.h @@ -0,0 +1,53 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ +namespace detail +{ + + +template +void stable_sort_by_count(execution_policy &exec, + RandomAccessIterator1 keys_first, + RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first, + Compare comp); + + +} // end detail +} // end detail +} // end cuda +} // end system +} // end thrust + +#include + diff --git a/compat/thrust/system/cuda/detail/detail/stable_sort_by_count.inl b/compat/thrust/system/cuda/detail/detail/stable_sort_by_count.inl new file mode 100644 index 0000000..5efb36b --- /dev/null +++ b/compat/thrust/system/cuda/detail/detail/stable_sort_by_count.inl @@ -0,0 +1,179 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#include +#include +#include +#include +#include +#include +#include + + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ +namespace detail +{ +namespace stable_sort_by_count_detail +{ + + +template +struct stable_sort_by_count_closure +{ + typedef Context context_type; + + RandomAccessIterator1 keys_first; + RandomAccessIterator2 values_first; + StrictWeakOrdering comp; // XXX this should probably be thrust::detail::device_function + const unsigned int n; + context_type context; + + stable_sort_by_count_closure(RandomAccessIterator1 keys_first, + RandomAccessIterator2 values_first, + StrictWeakOrdering comp, + const unsigned int n, + context_type context = context_type()) + : keys_first(keys_first), + values_first(values_first), + comp(comp), + n(n), + context(context) + {} + + __device__ __thrust_forceinline__ + void operator()(void) + { + typedef typename iterator_value::type KeyType; + typedef typename iterator_value::type ValueType; + + __shared__ uninitialized_array s_keys; + __shared__ uninitialized_array s_data; + + const unsigned int grid_size = context.grid_dimension() * context.block_dimension(); + + // block_offset records the global index of this block's 0th thread + unsigned int block_offset = context.block_index() * block_size; + unsigned int i = context.thread_index() + block_offset; + + // advance iterators + keys_first += i; + values_first += i; + + for(; + block_offset < n; + block_offset += grid_size, i += grid_size, keys_first += grid_size, values_first += grid_size) + { + context.barrier(); + // copy input to shared + if(i < n) + { + s_keys[context.thread_index()] = *keys_first; + s_data[context.thread_index()] = *values_first; + } // end if + context.barrier(); + + // this block could be partially full + unsigned int length = block_size; + if(block_offset + block_size > n) + { + length = n - block_offset; + } // end if + + // run merge_sort over the block + block::merging_sort(context, s_keys.begin(), s_data.begin(), length, comp); + + // write result + if(i < n) + { + *keys_first = s_keys[context.thread_index()]; + *values_first = s_data[context.thread_index()]; + } // end if + } // end for i + } + + + 
static size_t max_grid_size() + { + const device_properties_t& properties = device_properties(); + + const unsigned int max_threads = properties.maxThreadsPerMultiProcessor * properties.multiProcessorCount; + const unsigned int max_blocks = properties.maxGridSize[0]; + + return thrust::min(max_blocks, 3 * max_threads / block_size); + } // end max_grid_size() + + + size_t grid_size() const + { + // compute the maximum number of blocks we can launch on this arch + const unsigned int max_num_blocks = max_grid_size(); + + // first, sort within each block + size_t num_blocks = n / block_size; + if(n % block_size) ++num_blocks; + + return thrust::min(num_blocks, max_num_blocks); + } // end grid_size() +}; // stable_sort_by_count_closure + + +} // end stable_sort_by_count_detail + + +template +void stable_sort_by_count(execution_policy &, + RandomAccessIterator1 keys_first, + RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first, + Compare comp) +{ + typedef stable_sort_by_count_detail::stable_sort_by_count_closure< + count, + RandomAccessIterator1, + RandomAccessIterator2, + Compare, + detail::statically_blocked_thread_array + > Closure; + + Closure closure(keys_first, values_first, comp, keys_last - keys_first); + + // do an odd-even sort per block of data + detail::launch_closure(closure, closure.grid_size(), count); +} // end stable_sort_by_count() + + +} // end detail +} // end detail +} // end cuda +} // end system +} // end thrust + diff --git a/compat/thrust/system/cuda/detail/detail/uninitialized.h b/compat/thrust/system/cuda/detail/detail/uninitialized.h new file mode 100644 index 0000000..a3e3dd2 --- /dev/null +++ b/compat/thrust/system/cuda/detail/detail/uninitialized.h @@ -0,0 +1,261 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ +namespace detail +{ + + +template + class uninitialized +{ + private: + typename aligned_storage< + sizeof(T), + alignment_of::value + >::type storage; + + __device__ __thrust_forceinline__ const T* ptr() const + { + return reinterpret_cast(storage.data); + } + + __device__ __thrust_forceinline__ T* ptr() + { + return reinterpret_cast(storage.data); + } + + public: + // copy assignment + __device__ __thrust_forceinline__ uninitialized &operator=(const T &other) + { + T& self = *this; + self = other; + return *this; + } + + __device__ __thrust_forceinline__ T& get() + { + return *ptr(); + } + + __device__ __thrust_forceinline__ const T& get() const + { + return *ptr(); + } + + __device__ __thrust_forceinline__ operator T& () + { + return get(); + } + + __device__ __thrust_forceinline__ operator const T&() const + { + return get(); + } + + __thrust_forceinline__ __device__ void construct() + { + ::new(ptr()) T(); + } + + template + __thrust_forceinline__ __device__ void construct(const Arg &a) + { + ::new(ptr()) T(a); + } + + template + __thrust_forceinline__ __device__ void construct(const Arg1 &a1, const Arg2 &a2) + { + ::new(ptr()) T(a1,a2); + } + + template + __thrust_forceinline__ __device__ void construct(const Arg1 &a1, const Arg2 &a2, const Arg3 &a3) + { + ::new(ptr()) T(a1,a2,a3); + } + + template + __thrust_forceinline__ __device__ void construct(const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4) + { + ::new(ptr()) T(a1,a2,a3,a4); + } + + template + __thrust_forceinline__ __device__ void construct(const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4, const Arg5 &a5) + { + ::new(ptr()) T(a1,a2,a3,a4,a5); + } + + template + __thrust_forceinline__ __device__ void construct(const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4, const Arg5 &a5, const Arg6 &a6) + { + ::new(ptr()) T(a1,a2,a3,a4,a5,a6); + } + + template + __thrust_forceinline__ __device__ void construct(const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4, const Arg5 &a5, const Arg6 &a6, const Arg7 &a7) + { + ::new(ptr()) T(a1,a2,a3,a4,a5,a6,a7); + } + + template + __thrust_forceinline__ __device__ void construct(const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4, const Arg5 &a5, const Arg6 &a6, const Arg7 &a7, const Arg8 &a8) + { + ::new(ptr()) T(a1,a2,a3,a4,a5,a6,a7,a8); + } + + template + __thrust_forceinline__ __device__ void construct(const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4, const Arg5 &a5, const Arg6 &a6, const Arg7 &a7, const Arg8 &a8, const Arg9 &a9) + { + ::new(ptr()) T(a1,a2,a3,a4,a5,a6,a7,a8,a9); + } + + template + __thrust_forceinline__ __device__ void construct(const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4, const Arg5 &a5, const Arg6 &a6, const Arg7 &a7, const Arg8 &a8, const Arg9 &a9, const Arg10 &a10) + { + ::new(ptr()) T(a1,a2,a3,a4,a5,a6,a7,a8,a9,a10); + } + + __thrust_forceinline__ __device__ void destroy() + { + T& self = *this; + self.~T(); + } +}; + + +template + class uninitialized_array +{ + public: + typedef T value_type; + typedef T& reference; + typedef const T& const_reference; + typedef T* pointer; + typedef const T* const_pointer; + typedef pointer iterator; + typedef const_pointer const_iterator; + typedef std::size_t size_type; + + __thrust_forceinline__ __device__ iterator begin() + { + return data(); + } + + __thrust_forceinline__ __device__ 
const_iterator begin() const + { + return data(); + } + + __thrust_forceinline__ __device__ iterator end() + { + return begin() + size(); + } + + __thrust_forceinline__ __device__ const_iterator end() const + { + return begin() + size(); + } + + __thrust_forceinline__ __device__ const_iterator cbegin() const + { + return begin(); + } + + __thrust_forceinline__ __device__ const_iterator cend() const + { + return end(); + } + + __thrust_forceinline__ __device__ size_type size() const + { + return N; + } + + __thrust_forceinline__ __device__ bool empty() const + { + return false; + } + + __thrust_forceinline__ __device__ T* data() + { + return impl.get(); + } + + __thrust_forceinline__ __device__ const T* data() const + { + return impl.get(); + } + + // element access + __thrust_forceinline__ __device__ reference operator[](size_type n) + { + return data()[n]; + } + + __thrust_forceinline__ __device__ const_reference operator[](size_type n) const + { + return data()[n]; + } + + __thrust_forceinline__ __device__ reference front() + { + return *data(); + } + + __thrust_forceinline__ __device__ const_reference front() const + { + return *data(); + } + + __thrust_forceinline__ __device__ reference back() + { + return data()[size() - size_type(1)]; + } + + __thrust_forceinline__ __device__ const_reference back() const + { + return data()[size() - size_type(1)]; + } + + private: + uninitialized impl; +}; + + +} // end detail +} // end detail +} // end cuda +} // end system +} // end thrust + diff --git a/compat/thrust/system/cuda/detail/equal.h b/compat/thrust/system/cuda/detail/equal.h new file mode 100644 index 0000000..a307fc5 --- /dev/null +++ b/compat/thrust/system/cuda/detail/equal.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system has no special version of this algorithm + diff --git a/compat/thrust/system/cuda/detail/error.inl b/compat/thrust/system/cuda/detail/error.inl new file mode 100644 index 0000000..41b928f --- /dev/null +++ b/compat/thrust/system/cuda/detail/error.inl @@ -0,0 +1,95 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +#pragma once + +#include +#include + +namespace thrust +{ + +namespace system +{ + + +error_code make_error_code(cuda::errc::errc_t e) +{ + return error_code(static_cast(e), cuda_category()); +} // end make_error_code() + + +error_condition make_error_condition(cuda::errc::errc_t e) +{ + return error_condition(static_cast(e), cuda_category()); +} // end make_error_condition() + + +namespace cuda +{ + +namespace detail +{ + + +class cuda_error_category + : public error_category +{ + public: + inline cuda_error_category(void) {} + + inline virtual const char *name(void) const + { + return "cuda"; + } + + inline virtual std::string message(int ev) const + { + static const std::string unknown_err("Unknown error"); + const char *c_str = ::cudaGetErrorString(static_cast(ev)); + return c_str ? std::string(c_str) : unknown_err; + } + + inline virtual error_condition default_error_condition(int ev) const + { + using namespace cuda::errc; + + if(ev < ::cudaErrorApiFailureBase) + { + return make_error_condition(static_cast(ev)); + } + + return system_category().default_error_condition(ev); + } +}; // end cuda_error_category + +} // end detail + +} // end namespace cuda + + +const error_category &cuda_category(void) +{ + static const cuda::detail::cuda_error_category result; + return result; +} + + +} // end namespace system + +} // end namespace thrust + diff --git a/compat/thrust/system/cuda/detail/execution_policy.h b/compat/thrust/system/cuda/detail/execution_policy.h new file mode 100644 index 0000000..7dae04c --- /dev/null +++ b/compat/thrust/system/cuda/detail/execution_policy.h @@ -0,0 +1,131 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +// put the canonical tag in the same ns as the backend's entry points +namespace detail +{ + +// this awkward sequence of definitions arise +// from the desire both for tag to derive +// from execution_policy and for execution_policy +// to convert to tag (when execution_policy is not +// an ancestor of tag) + +// forward declaration of tag +struct tag; + +// forward declaration of execution_policy +template struct execution_policy; + +// specialize execution_policy for tag +template<> + struct execution_policy + : thrust::execution_policy +{}; + +// tag's definition comes before the +// generic definition of execution_policy +struct tag : execution_policy {}; + +// allow conversion to tag when it is not a successor +template + struct execution_policy + : thrust::execution_policy +{ + // allow conversion to tag + inline operator tag () const + { + return tag(); + } +}; + + +template + struct cross_system + : thrust::execution_policy > +{ + inline __host__ __device__ + cross_system(thrust::execution_policy &system1, + thrust::execution_policy &system2) + : system1(system1), system2(system2) + {} + + thrust::execution_policy &system1; + thrust::execution_policy &system2; + + inline __host__ __device__ + cross_system rotate() const + { + return cross_system(system2,system1); + } +}; + + +// overloads of select_system + +// cpp interop +template +inline __host__ __device__ +cross_system select_system(const execution_policy &system1, const thrust::cpp::execution_policy &system2) +{ + thrust::execution_policy &non_const_system1 = const_cast&>(system1); + thrust::cpp::execution_policy &non_const_system2 = const_cast&>(system2); + return cross_system(non_const_system1,non_const_system2); +} + + +template +inline __host__ __device__ +cross_system select_system(const thrust::cpp::execution_policy &system1, execution_policy &system2) +{ + thrust::cpp::execution_policy &non_const_system1 = const_cast&>(system1); + thrust::execution_policy &non_const_system2 = const_cast&>(system2); + return cross_system(non_const_system1,non_const_system2); +} + + +} // end detail + +// alias execution_policy and tag here +using thrust::system::cuda::detail::execution_policy; +using thrust::system::cuda::detail::tag; + +} // end cuda +} // end system + +// alias items at top-level +namespace cuda +{ + +using thrust::system::cuda::execution_policy; +using thrust::system::cuda::tag; + +} // end cuda +} // end thrust + diff --git a/compat/thrust/system/cuda/detail/extern_shared_ptr.h b/compat/thrust/system/cuda/detail/extern_shared_ptr.h new file mode 100644 index 0000000..5f34cc8 --- /dev/null +++ b/compat/thrust/system/cuda/detail/extern_shared_ptr.h @@ -0,0 +1,58 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +#pragma once + +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ + +template + class extern_shared_ptr +{ +// don't attempt to compile with any compiler other than nvcc +// due to use of __shared__ below +#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC + public: + __device__ + inline operator T * (void) + { + extern __shared__ int4 smem[]; + return reinterpret_cast(smem); + } + + __device__ + inline operator const T * (void) const + { + extern __shared__ int4 smem[]; + return reinterpret_cast(smem); + } +#endif // THRUST_DEVICE_COMPILER_NVCC +}; // end extern_shared_ptr + +} // end detail +} // end cuda +} // end system +} // end thrust + diff --git a/compat/thrust/system/cuda/detail/extrema.h b/compat/thrust/system/cuda/detail/extrema.h new file mode 100644 index 0000000..a307fc5 --- /dev/null +++ b/compat/thrust/system/cuda/detail/extrema.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system has no special version of this algorithm + diff --git a/compat/thrust/system/cuda/detail/fill.h b/compat/thrust/system/cuda/detail/fill.h new file mode 100644 index 0000000..9c753bb --- /dev/null +++ b/compat/thrust/system/cuda/detail/fill.h @@ -0,0 +1,54 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file fill.h + * \brief Device implementation of fill. + */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ + +template + void fill(execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + const T &value); + +template + OutputIterator fill_n(execution_policy &exec, + OutputIterator first, + Size n, + const T &value); + +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/cuda/detail/fill.inl b/compat/thrust/system/cuda/detail/fill.inl new file mode 100644 index 0000000..3c1feb8 --- /dev/null +++ b/compat/thrust/system/cuda/detail/fill.inl @@ -0,0 +1,178 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file fill.inl + * \brief Inline file for fill.h. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ +namespace detail +{ + + +template + WidePtr widen_raw_ptr(T *ptr) +{ + typedef thrust::detail::pointer_traits WideTraits; + typedef typename WideTraits::element_type WideT; + + // carefully widen the pointer to avoid warnings about conversions between differently aligned types on ARM + WideT *wide_raw_ptr = static_cast(static_cast(ptr)); + + return WideTraits::pointer_to(*wide_raw_ptr); +} + + +template + Pointer wide_fill_n(execution_policy &exec, + Pointer first, + Size n, + const T &value) +{ + typedef typename thrust::iterator_value::type OutputType; + + size_t ALIGNMENT_BOUNDARY = 128; // begin copying blocks at this byte boundary + + WideType wide_exemplar; + OutputType narrow_exemplars[sizeof(WideType) / sizeof(OutputType)]; + + for (size_t i = 0; i < sizeof(WideType) / sizeof(OutputType); i++) + narrow_exemplars[i] = static_cast(value); + + // cast through char * to avoid type punning warnings + for (size_t i = 0; i < sizeof(WideType); i++) + reinterpret_cast(&wide_exemplar)[i] = reinterpret_cast(narrow_exemplars)[i]; + + OutputType *first_raw = thrust::raw_pointer_cast(first); + OutputType *last_raw = first_raw + n; + + OutputType *block_first_raw = (thrust::min)(first_raw + n, thrust::detail::util::align_up(first_raw, ALIGNMENT_BOUNDARY)); + OutputType *block_last_raw = (thrust::max)(block_first_raw, thrust::detail::util::align_down(last_raw, sizeof(WideType))); + + // rebind Pointer to WideType + typedef typename thrust::detail::rebind_pointer::type WidePtr; + + // point to the widened range + // XXX since we've got an execution policy, we probably don't even need to deal with rebinding pointers + WidePtr block_first_wide = widen_raw_ptr(block_first_raw); + WidePtr block_last_wide = widen_raw_ptr(block_last_raw); + + thrust::generate(exec, first, Pointer(block_first_raw), thrust::detail::fill_functor(value)); + thrust::generate(exec, block_first_wide, block_last_wide, thrust::detail::fill_functor(wide_exemplar)); + thrust::generate(exec, Pointer(block_last_raw), first + n, thrust::detail::fill_functor(value)); + + return first + n; +} + +template + OutputIterator fill_n(execution_policy &exec, + OutputIterator first, + Size n, + const T &value, + thrust::detail::false_type) +{ + thrust::detail::fill_functor func(value); + return thrust::generate_n(exec, first, n, func); +} + +template + OutputIterator fill_n(execution_policy &exec, + OutputIterator first, + Size n, + const T &value, + thrust::detail::true_type) +{ + typedef typename thrust::iterator_traits::value_type OutputType; + + if ( thrust::detail::util::is_aligned(thrust::raw_pointer_cast(&*first)) ) + { + if (compute_capability() < 20) + { + // 32-bit writes are faster on G80 and GT200 + typedef unsigned int WideType; + wide_fill_n(exec, &*first, n, value); + } + else + { + // 64-bit writes are faster on Fermi + typedef unsigned long long WideType; 
+ wide_fill_n(exec, &*first, n, value); + } + + return first + n; + } + else + { + return fill_n(exec, first, n, value, thrust::detail::false_type()); + } +} + +} // end detail + +template + OutputIterator fill_n(execution_policy &exec, + OutputIterator first, + Size n, + const T &value) +{ + typedef typename thrust::iterator_traits::value_type OutputType; + + // we're compiling with nvcc, launch a kernel + const bool use_wide_fill = thrust::detail::is_trivial_iterator::value + && thrust::detail::has_trivial_assign::value + && (sizeof(OutputType) == 1 || sizeof(OutputType) == 2 || sizeof(OutputType) == 4); + + // XXX WAR usused variable warning + (void)use_wide_fill; + + return detail::fill_n(exec, first, n, value, thrust::detail::integral_constant()); +} + +template + void fill(execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + const T &value) +{ + thrust::system::cuda::detail::fill_n(exec, first, thrust::distance(first,last), value); +} // end fill() + +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cuda/detail/find.h b/compat/thrust/system/cuda/detail/find.h new file mode 100644 index 0000000..a307fc5 --- /dev/null +++ b/compat/thrust/system/cuda/detail/find.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system has no special version of this algorithm + diff --git a/compat/thrust/system/cuda/detail/for_each.h b/compat/thrust/system/cuda/detail/for_each.h new file mode 100644 index 0000000..56be13b --- /dev/null +++ b/compat/thrust/system/cuda/detail/for_each.h @@ -0,0 +1,60 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file for_each.h + * \brief Defines the interface for a function that executes a + * function or functional for each value in a given range. 
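+ *
+ * In this CUDA backend the function object is applied by a kernel in which
+ * each thread starts at its linear index and then strides through the
+ * range by the total grid size (see for_each.inl).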
+ */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ + +template + RandomAccessIterator for_each(execution_policy &s, + RandomAccessIterator first, + RandomAccessIterator last, + UnaryFunction f); + +template + RandomAccessIterator for_each_n(execution_policy &s, + RandomAccessIterator first, + Size n, + UnaryFunction f); + +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/cuda/detail/for_each.inl b/compat/thrust/system/cuda/detail/for_each.inl new file mode 100644 index 0000000..be6e561 --- /dev/null +++ b/compat/thrust/system/cuda/detail/for_each.inl @@ -0,0 +1,199 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file for_each.inl + * \brief Inline file for for_each.h. + */ + +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ +namespace for_each_n_detail +{ + + +template +struct for_each_n_closure +{ + typedef void result_type; + typedef Context context_type; + + RandomAccessIterator first; + Size n; + thrust::detail::device_function f; + Context context; + + for_each_n_closure(RandomAccessIterator first, + Size n, + UnaryFunction f, + Context context = Context()) + : first(first), n(n), f(f), context(context) + {} + + __device__ __thrust_forceinline__ + result_type operator()(void) + { + const Size grid_size = context.block_dimension() * context.grid_dimension(); + + Size i = context.linear_index(); + + // advance iterator + first += i; + + while(i < n) + { + f(*first); + i += grid_size; + first += grid_size; + } + } +}; // end for_each_n_closure + + +template +thrust::tuple configure_launch(Size n) +{ + // calculate launch configuration + detail::launch_calculator calculator; + + thrust::tuple config = calculator.with_variable_block_size(); + size_t max_blocks = thrust::get<0>(config); + size_t block_size = thrust::get<1>(config); + size_t num_blocks = thrust::min(max_blocks, thrust::detail::util::divide_ri(n, block_size)); + + return thrust::make_tuple(num_blocks, block_size); +} + + +template +bool use_big_closure(Size n, unsigned int little_grid_size) +{ + // use the big closure when n will not fit within an unsigned int + // or if incrementing an unsigned int by little_grid_size would overflow + // the counter + + Size threshold = std::numeric_limits::max(); + + bool result = (sizeof(Size) > sizeof(unsigned int)) && (n > threshold); + + if(!result) + { + // check if we'd overflow the little closure's counter + unsigned int little_n = static_cast(n); + + if((little_n - 1u) + little_grid_size < little_n) + { + result = true; + } + } + + return result; +} + + +} // end for_each_n_detail + + +template +RandomAccessIterator for_each_n(execution_policy &, + RandomAccessIterator first, + Size n, + 
UnaryFunction f) +{ + // we're attempting to launch a kernel, assert we're compiling with nvcc + // ======================================================================== + // X Note to the user: If you've found this line due to a compiler error, X + // X you need to compile your code using nvcc, rather than g++ or cl.exe X + // ======================================================================== + THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); + + if(n <= 0) return first; // empty range + + // create two candidate closures to implement the for_each + // choose between them based on the whether we can fit n into a smaller integer + // and whether or not we'll overflow the closure's counter + + typedef detail::blocked_thread_array Context; + typedef for_each_n_detail::for_each_n_closure BigClosure; + typedef for_each_n_detail::for_each_n_closure LittleClosure; + + BigClosure big_closure(first, n, f); + LittleClosure little_closure(first, static_cast(n), f); + + thrust::tuple little_config = for_each_n_detail::configure_launch(n); + + unsigned int little_grid_size = thrust::get<0>(little_config) * thrust::get<1>(little_config); + + if(for_each_n_detail::use_big_closure(n, little_grid_size)) + { + // launch the big closure + thrust::tuple big_config = for_each_n_detail::configure_launch(n); + detail::launch_closure(big_closure, thrust::get<0>(big_config), thrust::get<1>(big_config)); + } + else + { + // launch the little closure + detail::launch_closure(little_closure, thrust::get<0>(little_config), thrust::get<1>(little_config)); + } + + return first + n; +} + + +template + InputIterator for_each(execution_policy &exec, + InputIterator first, + InputIterator last, + UnaryFunction f) +{ + return cuda::detail::for_each_n(exec, first, thrust::distance(first,last), f); +} // end for_each() + + +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cuda/detail/gather.h b/compat/thrust/system/cuda/detail/gather.h new file mode 100644 index 0000000..a307fc5 --- /dev/null +++ b/compat/thrust/system/cuda/detail/gather.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system has no special version of this algorithm + diff --git a/compat/thrust/system/cuda/detail/generate.h b/compat/thrust/system/cuda/detail/generate.h new file mode 100644 index 0000000..a307fc5 --- /dev/null +++ b/compat/thrust/system/cuda/detail/generate.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system has no special version of this algorithm + diff --git a/compat/thrust/system/cuda/detail/get_value.h b/compat/thrust/system/cuda/detail/get_value.h new file mode 100644 index 0000000..273023f --- /dev/null +++ b/compat/thrust/system/cuda/detail/get_value.h @@ -0,0 +1,93 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ + + +namespace +{ + + +template +inline __host__ __device__ + typename thrust::iterator_value::type + get_value_msvc2005_war(execution_policy &exec, Pointer ptr) +{ + typedef typename thrust::iterator_value::type result_type; + + // XXX war nvbugs/881631 + struct war_nvbugs_881631 + { + __host__ inline static result_type host_path(execution_policy &exec, Pointer ptr) + { + // when called from host code, implement with assign_value + // note that this requires a type with default constructor + result_type result; + + thrust::host_system_tag host_tag; + cross_system systems(host_tag, exec); + assign_value(systems, &result, ptr); + + return result; + } + + __device__ inline static result_type device_path(execution_policy &, Pointer ptr) + { + // when called from device code, just do simple deref + return *thrust::raw_pointer_cast(ptr); + } + }; + +#ifndef __CUDA_ARCH__ + return war_nvbugs_881631::host_path(exec, ptr); +#else + return war_nvbugs_881631::device_path(exec, ptr); +#endif // __CUDA_ARCH__ +} // end get_value_msvc2005_war() + + +} // end anon namespace + + +template +inline __host__ __device__ + typename thrust::iterator_value::type + get_value(execution_policy &exec, Pointer ptr) +{ + return get_value_msvc2005_war(exec,ptr); +} // end get_value() + + +} // end detail +} // end cuda +} // end system +} // end thrust + diff --git a/compat/thrust/system/cuda/detail/guarded_cuda_runtime_api.h b/compat/thrust/system/cuda/detail/guarded_cuda_runtime_api.h new file mode 100644 index 0000000..e6c0d28 --- /dev/null +++ b/compat/thrust/system/cuda/detail/guarded_cuda_runtime_api.h @@ -0,0 +1,39 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// the purpose of this header is to check for the existence of macros +// such as __host__ and __device__, which may already be defined by thrust +// and to undefine them before entering cuda_runtime_api.h (which will redefine them) + +// we only try to do this stuff if cuda/include/host_defines.h has been included +#if !defined(__HOST_DEFINES_H__) + +#ifdef __host__ +#undef __host__ +#endif // __host__ + +#ifdef __device__ +#undef __device__ +#endif // __device__ + +#endif // __HOST_DEFINES_H__ + +#include + diff --git a/compat/thrust/system/cuda/detail/inner_product.h b/compat/thrust/system/cuda/detail/inner_product.h new file mode 100644 index 0000000..a307fc5 --- /dev/null +++ b/compat/thrust/system/cuda/detail/inner_product.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system has no special version of this algorithm + diff --git a/compat/thrust/system/cuda/detail/iter_swap.h b/compat/thrust/system/cuda/detail/iter_swap.h new file mode 100644 index 0000000..9b2bcf0 --- /dev/null +++ b/compat/thrust/system/cuda/detail/iter_swap.h @@ -0,0 +1,65 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ + + +template +inline __host__ __device__ +void iter_swap(tag, Pointer1 a, Pointer2 b) +{ + // XXX war nvbugs/881631 + struct war_nvbugs_881631 + { + __host__ inline static void host_path(Pointer1 a, Pointer2 b) + { + thrust::swap_ranges(a, a + 1, b); + } + + __device__ inline static void device_path(Pointer1 a, Pointer2 b) + { + using thrust::swap; + swap(*thrust::raw_pointer_cast(a), + *thrust::raw_pointer_cast(b)); + } + }; + +#ifndef __CUDA_ARCH__ + return war_nvbugs_881631::host_path(a,b); +#else + return war_nvbugs_881631::device_path(a,b); +#endif // __CUDA_ARCH__ +} // end iter_swap() + + +} // end detail +} // end cuda +} // end system +} // end thrust + diff --git a/compat/thrust/system/cuda/detail/logical.h b/compat/thrust/system/cuda/detail/logical.h new file mode 100644 index 0000000..a307fc5 --- /dev/null +++ b/compat/thrust/system/cuda/detail/logical.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system has no special version of this algorithm + diff --git a/compat/thrust/system/cuda/detail/malloc_and_free.h b/compat/thrust/system/cuda/detail/malloc_and_free.h new file mode 100644 index 0000000..676dd7c --- /dev/null +++ b/compat/thrust/system/cuda/detail/malloc_and_free.h @@ -0,0 +1,71 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ + + +// note that malloc returns a raw pointer to avoid +// depending on the heavyweight thrust/system/cuda/memory.h header +template + void *malloc(execution_policy &, std::size_t n) +{ + void *result = 0; + + cudaError_t error = cudaMalloc(reinterpret_cast(&result), n); + + if(error) + { + throw thrust::system::detail::bad_alloc(thrust::cuda_category().message(error).c_str()); + } // end if + + return result; +} // end malloc() + + +template + void free(execution_policy &, Pointer ptr) +{ + cudaError_t error = cudaFree(thrust::raw_pointer_cast(ptr)); + + if(error) + { + throw thrust::system_error(error, thrust::cuda_category()); + } // end error +} // end free() + + +} // end detail +} // end cuda +} // end system +} // end thrust + diff --git a/compat/thrust/system/cuda/detail/memory.inl b/compat/thrust/system/cuda/detail/memory.inl new file mode 100644 index 0000000..998b54e --- /dev/null +++ b/compat/thrust/system/cuda/detail/memory.inl @@ -0,0 +1,94 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +namespace thrust +{ + +// XXX WAR an issue with MSVC 2005 (cl v14.00) incorrectly implementing +// pointer_raw_pointer for pointer by specializing it here +// note that we specialize it here, before the use of raw_pointer_cast +// below, which causes pointer_raw_pointer's instantiation +#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC) && (_MSC_VER <= 1400) +namespace detail +{ + +template + struct pointer_raw_pointer< thrust::cuda::pointer > +{ + typedef typename thrust::cuda::pointer::raw_pointer type; +}; // end pointer_raw_pointer + +} // end detail +#endif + +namespace system +{ +namespace cuda +{ + + +template + template + reference & + reference + ::operator=(const reference &other) +{ + return super_t::operator=(other); +} // end reference::operator=() + +template + reference & + reference + ::operator=(const value_type &x) +{ + return super_t::operator=(x); +} // end reference::operator=() + +template +__host__ __device__ +void swap(reference a, reference b) +{ + a.swap(b); +} // end swap() + +pointer malloc(std::size_t n) +{ + tag cuda_tag; + return pointer(thrust::system::cuda::detail::malloc(cuda_tag, n)); +} // end malloc() + +template +pointer malloc(std::size_t n) +{ + pointer raw_ptr = thrust::system::cuda::malloc(sizeof(T) * n); + return pointer(reinterpret_cast(raw_ptr.get())); +} // end malloc() + +void free(pointer ptr) +{ + tag cuda_tag; + return thrust::system::cuda::detail::free(cuda_tag, ptr.get()); +} // end free() + +} // end cuda +} // end system +} // end thrust + diff --git a/compat/thrust/system/cuda/detail/merge.h b/compat/thrust/system/cuda/detail/merge.h new file mode 100644 index 0000000..e01b705 --- /dev/null +++ b/compat/thrust/system/cuda/detail/merge.h @@ -0,0 +1,50 @@ 
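+// The CUDA merge implemented in merge.inl splits the output into equal
+// "diagonals" and binary-searches each diagonal boundary to decide how many
+// elements come from each input.  A minimal host-side sketch of that search,
+// under simplifying assumptions (plain ints, operator<); the names here are
+// illustrative and not part of Thrust:
+#include <cstddef>
+#include <utility>
+
+// how many of the first 'diag' merged outputs are drawn from a (and from b)
+inline std::pair<std::size_t, std::size_t>
+diagonal_split_sketch(const int *a, std::size_t n1,
+                      const int *b, std::size_t n2,
+                      std::size_t diag)
+{
+  std::size_t lo = diag > n2 ? diag - n2 : 0;   // at least diag - n2 must come from a
+  std::size_t hi = diag < n1 ? diag : n1;       // at most min(diag, n1) can come from a
+
+  while(lo < hi)
+  {
+    std::size_t mid = (lo + hi) / 2;
+    if(b[diag - mid - 1] < a[mid])
+      hi = mid;        // a[mid] comes after that b value, so at most mid elements of a are used
+    else
+      lo = mid + 1;    // a[mid] is merged no later than that b value, so at least mid+1 are used
+  }
+
+  return std::make_pair(lo, diag - lo);
+}
+// Example: a = {1,3,5}, b = {2,4,6}, diag = 3 gives (2,1); the first three
+// merged values {1,2,3} take {1,3} from a and {2} from b.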
+/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ + +template + RandomAccessIterator3 merge(execution_policy &exec, + RandomAccessIterator1 first1, + RandomAccessIterator1 last1, + RandomAccessIterator2 first2, + RandomAccessIterator2 last2, + RandomAccessIterator3 result, + StrictWeakOrdering comp); + +} // end detail +} // end cuda +} // end system +} // end thrust + +#include + diff --git a/compat/thrust/system/cuda/detail/merge.inl b/compat/thrust/system/cuda/detail/merge.inl new file mode 100644 index 0000000..bf7516f --- /dev/null +++ b/compat/thrust/system/cuda/detail/merge.inl @@ -0,0 +1,285 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ +namespace merge_detail +{ + + +template +__device__ __thrust_forceinline__ +thrust::pair + partition_search(RandomAccessIterator1 first1, + RandomAccessIterator2 first2, + Size diag, + Size lower_bound1, + Size upper_bound1, + Size lower_bound2, + Size upper_bound2, + Compare comp) +{ + Size begin = thrust::max(lower_bound1, diag - upper_bound2); + Size end = thrust::min(diag - lower_bound2, upper_bound1); + + while(begin < end) + { + Size mid = (begin + end) / 2; + Size index1 = mid; + Size index2 = diag - mid - 1; + + if(comp(first2[index2], first1[index1])) + { + end = mid; + } + else + { + begin = mid + 1; + } + } + + return thrust::make_pair(begin, diag - begin); +} + + +template +__device__ __thrust_forceinline__ +void merge_n(Context &ctx, + RandomAccessIterator1 first1, + Size n1, + RandomAccessIterator2 first2, + Size n2, + RandomAccessIterator3 result, + Compare comp_, + unsigned int work_per_thread) +{ + const unsigned int block_size = ctx.block_dimension(); + thrust::detail::device_function comp(comp_); + typedef typename thrust::iterator_value::type value_type1; + typedef typename thrust::iterator_value::type value_type2; + + Size result_size = n1 + n2; + + // this is just oversubscription_rate * block_size * work_per_thread + // but it makes no sense to send oversubscription_rate as an extra parameter + Size work_per_block = thrust::detail::util::divide_ri(result_size, ctx.grid_dimension()); + + using thrust::system::cuda::detail::detail::uninitialized; + __shared__ uninitialized > s_block_input_begin; + + Size result_block_offset = ctx.block_index() * work_per_block; + + // find where this block's input begins in both input sequences + if(ctx.thread_index() == 0) + { + s_block_input_begin = (ctx.block_index() == 0) ? 
+ thrust::pair(0,0) : + partition_search(first1, first2, + result_block_offset, + Size(0), n1, + Size(0), n2, + comp); + } + + ctx.barrier(); + + // iterate to consume this block's input + Size work_per_iteration = block_size * work_per_thread; + thrust::pair block_input_end = s_block_input_begin; + block_input_end.first += work_per_iteration; + block_input_end.second += work_per_iteration; + Size result_block_offset_last = result_block_offset + thrust::min(work_per_block, result_size - result_block_offset); + + for(; + result_block_offset < result_block_offset_last; + result_block_offset += work_per_iteration, + block_input_end.first += work_per_iteration, + block_input_end.second += work_per_iteration + ) + { + // find where this thread's input begins in both input sequences for this iteration + thrust::pair thread_input_begin = + partition_search(first1, first2, + Size(result_block_offset + ctx.thread_index() * work_per_thread), + s_block_input_begin.get().first, thrust::min(block_input_end.first , n1), + s_block_input_begin.get().second, thrust::min(block_input_end.second, n2), + comp); + + ctx.barrier(); + + // XXX the performance impact of not keeping x1 & x2 + // in registers is about 10% for int32 + uninitialized x1; + uninitialized x2; + + // XXX this is just a serial merge -- try to simplify or abstract this loop + Size i = result_block_offset + ctx.thread_index() * work_per_thread; + Size last_i = i + thrust::min(work_per_thread, result_size - thread_input_begin.first - thread_input_begin.second); + for(; + i < last_i; + ++i) + { + // optionally load x1 & x2 + bool output_x2 = true; + if(thread_input_begin.second < n2) + { + x2 = first2[thread_input_begin.second]; + } + else + { + output_x2 = false; + } + + if(thread_input_begin.first < n1) + { + x1 = first1[thread_input_begin.first]; + + if(output_x2) + { + output_x2 = comp(x2.get(), x1.get()); + } + } + + result[i] = output_x2 ? 
x2.get() : x1.get(); + + if(output_x2) + { + ++thread_input_begin.second; + } + else + { + ++thread_input_begin.first; + } + } // end for + + // the block's last thread has conveniently located the + // beginning of the next iteration's input + if(ctx.thread_index() == block_size-1) + { + s_block_input_begin = thread_input_begin; + } + ctx.barrier(); + } // end for +} // end merge_n + + +template + struct merge_n_closure +{ + typedef thrust::system::cuda::detail::detail::blocked_thread_array context_type; + + RandomAccessIterator1 first1; + Size n1; + RandomAccessIterator2 first2; + Size n2; + RandomAccessIterator3 result; + Compare comp; + Size work_per_thread; + + merge_n_closure(RandomAccessIterator1 first1, Size n1, RandomAccessIterator2 first2, Size n2, RandomAccessIterator3 result, Compare comp, Size work_per_thread) + : first1(first1), n1(n1), first2(first2), n2(n2), result(result), comp(comp), work_per_thread(work_per_thread) + {} + + __device__ __forceinline__ + void operator()() + { + context_type ctx; + merge_n(ctx, first1, n1, first2, n2, result, comp, work_per_thread); + } +}; + + +// returns (work_per_thread, threads_per_block, oversubscription_factor) +template + thrust::tuple + tunables(RandomAccessIterator1, RandomAccessIterator1, RandomAccessIterator2, RandomAccessIterator2, RandomAccessIterator3, Compare comp) +{ + // determined by empirical testing on GTX 480 + // ~4500 Mkeys/s on GTX 480 + const unsigned int work_per_thread = 5; + const unsigned int threads_per_block = 128; + const unsigned int oversubscription_factor = 30; + + return thrust::make_tuple(work_per_thread, threads_per_block, oversubscription_factor); +} + + +} // end merge_detail + + +template +RandomAccessIterator3 merge(execution_policy &exec, + RandomAccessIterator1 first1, + RandomAccessIterator1 last1, + RandomAccessIterator2 first2, + RandomAccessIterator2 last2, + RandomAccessIterator3 result, + Compare comp) +{ + typedef typename thrust::iterator_difference::type Size; + Size n1 = last1 - first1; + Size n2 = last2 - first2; + typename thrust::iterator_difference::type n = n1 + n2; + + // empty result + if(n <= 0) return result; + + unsigned int work_per_thread = 0, threads_per_block = 0, oversubscription_factor = 0; + thrust::tie(work_per_thread,threads_per_block,oversubscription_factor) + = merge_detail::tunables(first1, last1, first2, last2, result, comp); + + const unsigned int work_per_block = work_per_thread * threads_per_block; + + const unsigned int num_processors = device_properties().multiProcessorCount; + const unsigned int num_blocks = thrust::min(oversubscription_factor * num_processors, thrust::detail::util::divide_ri(n, work_per_block)); + + typedef merge_detail::merge_n_closure closure_type; + closure_type closure(first1, n1, first2, n2, result, comp, work_per_thread); + + detail::launch_closure(closure, num_blocks, threads_per_block); + + return result + n1 + n2; +} // end merge() + + +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cuda/detail/mismatch.h b/compat/thrust/system/cuda/detail/mismatch.h new file mode 100644 index 0000000..a307fc5 --- /dev/null +++ b/compat/thrust/system/cuda/detail/mismatch.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system has no special version of this algorithm + diff --git a/compat/thrust/system/cuda/detail/par.h b/compat/thrust/system/cuda/detail/par.h new file mode 100644 index 0000000..e56128c --- /dev/null +++ b/compat/thrust/system/cuda/detail/par.h @@ -0,0 +1,66 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ + + +struct par_t : thrust::system::cuda::detail::execution_policy +{ + par_t() : thrust::system::cuda::detail::execution_policy() {} + + template + thrust::detail::execute_with_allocator + operator()(Allocator &alloc) const + { + return thrust::detail::execute_with_allocator(alloc); + } +}; + + +} // end detail + + +static const detail::par_t par; + + +} // end cuda +} // end system + + +// alias par here +namespace cuda +{ + + +using thrust::system::cuda::par; + + +} // end cuda +} // end thrust + diff --git a/compat/thrust/system/cuda/detail/partition.h b/compat/thrust/system/cuda/detail/partition.h new file mode 100644 index 0000000..a307fc5 --- /dev/null +++ b/compat/thrust/system/cuda/detail/partition.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system has no special version of this algorithm + diff --git a/compat/thrust/system/cuda/detail/reduce.h b/compat/thrust/system/cuda/detail/reduce.h new file mode 100644 index 0000000..d188f60 --- /dev/null +++ b/compat/thrust/system/cuda/detail/reduce.h @@ -0,0 +1,54 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file reduce.h + * \brief Reduce a sequence of elements with a given length. + */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ + + +template + OutputType reduce(execution_policy &exec, + InputIterator first, + InputIterator last, + OutputType init, + BinaryFunction binary_op); + + +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/cuda/detail/reduce.inl b/compat/thrust/system/cuda/detail/reduce.inl new file mode 100644 index 0000000..66b4ac7 --- /dev/null +++ b/compat/thrust/system/cuda/detail/reduce.inl @@ -0,0 +1,275 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file reduce.inl + * \brief Inline file for reduce.h + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ + +namespace reduce_detail +{ + +/* + * Reduce a vector of n elements using binary_op() + * + * The order of reduction is not defined, so binary_op() should + * be a commutative (and associative) operator such as + * (integer) addition. Since floating point operations + * do not completely satisfy these criteria, the result is + * generally not the same as a consecutive reduction of + * the elements. 
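+ *
+ * (For instance, in double precision (1e16 + -1e16) + 1.0 == 1.0 while
+ * 1e16 + (-1e16 + 1.0) == 0.0, because 1.0 is lost when added to -1e16;
+ * regrouping the same additions changes the result.)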
+ * + * Uses the same pattern as reduce6() in the CUDA SDK + * + */ +template +struct unordered_reduce_closure +{ + InputIterator input; + Size n; + T init; + OutputIterator output; + BinaryFunction binary_op; + unsigned int shared_array_size; + + typedef Context context_type; + context_type context; + + unordered_reduce_closure(InputIterator input, Size n, T init, OutputIterator output, BinaryFunction binary_op, unsigned int shared_array_size, Context context = Context()) + : input(input), n(n), init(init), output(output), binary_op(binary_op), shared_array_size(shared_array_size), context(context) {} + + __device__ __thrust_forceinline__ + void operator()(void) + { + typedef typename thrust::iterator_value::type OutputType; + extern_shared_ptr shared_array; + + Size grid_size = context.block_dimension() * context.grid_dimension(); + + Size i = context.linear_index(); + + input += i; + + // compute reduction with all blockDim.x threads + OutputType sum = thrust::raw_reference_cast(*input); + + i += grid_size; + input += grid_size; + + while (i < n) + { + OutputType val = thrust::raw_reference_cast(*input); + + sum = binary_op(sum, val); + + i += grid_size; + input += grid_size; + } + + // write first shared_array_size values into shared memory + if (context.thread_index() < shared_array_size) + shared_array[context.thread_index()] = sum; + + // accumulate remaining values (if any) to shared memory in stages + if (context.block_dimension() > shared_array_size) + { + unsigned int lb = shared_array_size; + unsigned int ub = shared_array_size + lb; + + while (lb < context.block_dimension()) + { + context.barrier(); + + if (lb <= context.thread_index() && context.thread_index() < ub) + { + OutputType tmp = shared_array[context.thread_index() - lb]; + shared_array[context.thread_index() - lb] = binary_op(tmp, sum); + } + + lb += shared_array_size; + ub += shared_array_size; + } + } + + context.barrier(); + + block::reduce_n(context, shared_array, thrust::min(context.block_dimension(), shared_array_size), binary_op); + + if (context.thread_index() == 0) + { + OutputType tmp = shared_array[0]; + + if (context.grid_dimension() == 1) + tmp = binary_op(init, tmp); + + output += context.block_index(); + *output = tmp; + } + } +}; + + +__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN + +template + OutputType reduce(execution_policy &exec, + InputIterator first, + InputIterator last, + OutputType init, + BinaryFunction binary_op) +{ + // we're attempting to launch a kernel, assert we're compiling with nvcc + // ======================================================================== + // X Note to the user: If you've found this line due to a compiler error, X + // X you need to compile your code using nvcc, rather than g++ or cl.exe X + // ======================================================================== + THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); + + typedef typename thrust::iterator_difference::type difference_type; + + difference_type n = thrust::distance(first,last); + + if (n == 0) + return init; + + typedef thrust::detail::temporary_array OutputArray; + typedef typename OutputArray::iterator OutputIterator; + + typedef detail::blocked_thread_array Context; + typedef unordered_reduce_closure Closure; + + function_attributes_t attributes = detail::closure_attributes(); + + // TODO chose this in a more principled manner + size_t threshold = thrust::max(2 * attributes.maxThreadsPerBlock, 1024); + + device_properties_t properties = device_properties(); + 
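+  // The reduction below takes at most two passes: a first kernel in which the
+  // blocks walk [first, last) in a grid-stride loop, each block producing one
+  // partial result, and, if more than one block was launched, a second
+  // single-block kernel that combines the per-block partials.  'init' is
+  // folded in by whichever kernel runs with a single block, so it is applied
+  // exactly once.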
+ // launch configuration + size_t num_blocks; + size_t block_size; + size_t array_size; + size_t smem_bytes; + + // first level reduction + if (static_cast(n) < threshold) + { + num_blocks = 1; + block_size = thrust::min(static_cast(n), static_cast(attributes.maxThreadsPerBlock)); + array_size = thrust::min(block_size, (properties.sharedMemPerBlock - attributes.sharedSizeBytes) / sizeof(OutputType)); + smem_bytes = sizeof(OutputType) * array_size; + } + else + { + detail::launch_calculator calculator; + + thrust::tuple config = calculator.with_variable_block_size_available_smem(); + + num_blocks = thrust::min(thrust::get<0>(config), static_cast(n) / thrust::get<1>(config)); + block_size = thrust::get<1>(config); + array_size = thrust::min(block_size, thrust::get<2>(config) / sizeof(OutputType)); + smem_bytes = sizeof(OutputType) * array_size; + } + + // TODO assert(n <= num_blocks * block_size); + // TODO if (shared_array_size < 1) throw cuda exception "insufficient shared memory" + + OutputArray output(exec, num_blocks); + + Closure closure(first, n, init, output.begin(), binary_op, array_size); + + //std::cout << "Launching " << num_blocks << " blocks of kernel with " << block_size << " threads and " << smem_bytes << " shared memory per block " << std::endl; + + detail::launch_closure(closure, num_blocks, block_size, smem_bytes); + + // second level reduction + if (num_blocks > 1) + { + typedef detail::blocked_thread_array Context; + typedef unordered_reduce_closure Closure; + + function_attributes_t attributes = detail::closure_attributes(); + + num_blocks = 1; + block_size = thrust::min(output.size(), static_cast(attributes.maxThreadsPerBlock)); + array_size = thrust::min(block_size, (properties.sharedMemPerBlock - attributes.sharedSizeBytes) / sizeof(OutputType)); + smem_bytes = sizeof(OutputType) * array_size; + + // TODO if (shared_array_size < 1) throw cuda exception "insufficient shared memory" + + Closure closure(output.begin(), output.size(), init, output.begin(), binary_op, array_size); + + //std::cout << "Launching " << num_blocks << " blocks of kernel with " << block_size << " threads and " << smem_bytes << " shared memory per block " << std::endl; + + detail::launch_closure(closure, num_blocks, block_size, smem_bytes); + } + + return output[0]; +} // end reduce + +} // end reduce_detail + +__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END + +template + OutputType reduce(execution_policy &exec, + InputIterator first, + InputIterator last, + OutputType init, + BinaryFunction binary_op) +{ + return reduce_detail::reduce(exec, first, last, init, binary_op); +} // end reduce() + +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cuda/detail/reduce_by_key.h b/compat/thrust/system/cuda/detail/reduce_by_key.h new file mode 100644 index 0000000..9b8ec10 --- /dev/null +++ b/compat/thrust/system/cuda/detail/reduce_by_key.h @@ -0,0 +1,59 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file reduce_by_key.h + * \brief CUDA implementation of reduce_by_key + */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ + +template + thrust::pair + reduce_by_key(execution_policy &exec, + InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + OutputIterator1 keys_output, + OutputIterator2 values_output, + BinaryPredicate binary_pred, + BinaryFunction binary_op); + +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/cuda/detail/reduce_by_key.inl b/compat/thrust/system/cuda/detail/reduce_by_key.inl new file mode 100644 index 0000000..18dc1e4 --- /dev/null +++ b/compat/thrust/system/cuda/detail/reduce_by_key.inl @@ -0,0 +1,705 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ +namespace reduce_by_key_detail +{ + +template +struct tail_flag_functor +{ + BinaryPredicate binary_pred; // NB: this must be the first member for performance reasons + IndexType n; + + typedef FlagType result_type; + + tail_flag_functor(IndexType n, BinaryPredicate binary_pred) + : n(n), binary_pred(binary_pred) + {} + + // XXX why is this noticably faster? 
(it may read past the end of input) + //FlagType operator()(const thrust::tuple& t) const + + template + __host__ __device__ __thrust_forceinline__ + FlagType operator()(const Tuple& t) + { + if (thrust::get<0>(t) == (n - 1) || !binary_pred(thrust::get<1>(t), thrust::get<2>(t))) + return 1; + else + return 0; + } +}; + + +template +__device__ __thrust_forceinline__ +FlagType load_flags(Context context, + const unsigned int n, + FlagIterator iflags, + FlagType (&sflag)[CTA_SIZE]) +{ + FlagType flag_bits = 0; + + // load flags in unordered fashion + for(unsigned int k = 0; k < K; k++) + { + const unsigned int offset = k*CTA_SIZE + context.thread_index(); + + if (FullBlock || offset < n) + { + FlagIterator temp = iflags + offset; + if (*temp) + flag_bits |= FlagType(1) << k; + } + } + + sflag[context.thread_index()] = flag_bits; + + context.barrier(); + + flag_bits = 0; + + // obtain flags for iflags[K * context.thread_index(), K * context.thread_index() + K) + for(unsigned int k = 0; k < K; k++) + { + const unsigned int offset = K * context.thread_index() + k; + + if (FullBlock || offset < n) + { + flag_bits |= ((sflag[offset % CTA_SIZE] >> (offset / CTA_SIZE)) & FlagType(1)) << k; + } + } + + context.barrier(); + + sflag[context.thread_index()] = flag_bits; + + context.barrier(); + + return flag_bits; +} + +template +__device__ __thrust_forceinline__ +void load_values(Context context, + const unsigned int n, + InputIterator2 ivals, + ValueType (&sdata)[K][CTA_SIZE + 1]) +{ + for(unsigned int k = 0; k < K; k++) + { + const unsigned int offset = k*CTA_SIZE + context.thread_index(); + + if (FullBlock || offset < n) + { + InputIterator2 temp = ivals + offset; + sdata[offset % K][offset / K] = *temp; + } + } + + context.barrier(); +} + + +template +__device__ __thrust_forceinline__ +void reduce_by_key_body(Context context, + const unsigned int n, + InputIterator1 ikeys, + InputIterator2 ivals, + OutputIterator1 okeys, + OutputIterator2 ovals, + BinaryPredicate binary_pred, + BinaryFunction binary_op, + FlagIterator iflags, + FlagType (&sflag)[CTA_SIZE], + ValueType (&sdata)[K][CTA_SIZE + 1], + bool& carry_in, + IndexType& carry_index, + ValueType& carry_value) +{ + // load flags + const FlagType flag_bits = load_flags(context, n, iflags, sflag); + const FlagType flag_count = __popc(flag_bits); // TODO hide this behind a template + const FlagType left_flag = (context.thread_index() == 0) ? 0 : sflag[context.thread_index() - 1]; + const FlagType head_flag = (context.thread_index() == 0 || flag_bits & ((1 << (K - 1)) - 1) || left_flag & (1 << (K - 1))) ? 1 : 0; + + context.barrier(); + + // scan flag counts + sflag[context.thread_index()] = flag_count; context.barrier(); + + block::inclusive_scan(context, sflag, thrust::plus()); + + const FlagType output_position = (context.thread_index() == 0) ? 
0 : sflag[context.thread_index() - 1]; + const FlagType num_outputs = sflag[CTA_SIZE - 1]; + + context.barrier(); + + // shuffle keys and write keys out + if (!thrust::detail::is_discard_iterator::value) + { + // XXX this could be improved + for (unsigned int i = 0; i < num_outputs; i += CTA_SIZE) + { + FlagType position = output_position; + + for(unsigned int k = 0; k < K; k++) + { + if (flag_bits & (FlagType(1) << k)) + { + if (i <= position && position < i + CTA_SIZE) + sflag[position - i] = K * context.thread_index() + k; + position++; + } + } + + context.barrier(); + + if (i + context.thread_index() < num_outputs) + { + InputIterator1 tmp1 = ikeys + sflag[context.thread_index()]; + OutputIterator1 tmp2 = okeys + (i + context.thread_index()); + *tmp2 = *tmp1; + } + + context.barrier(); + } + } + + // load values + load_values (context, n, ivals, sdata); + + ValueType ldata[K]; + for (unsigned int k = 0; k < K; k++) + ldata[k] = sdata[k][context.thread_index()]; + + // carry in (if necessary) + if (context.thread_index() == 0 && carry_in) + { + // XXX WAR sm_10 issue + ValueType tmp1 = carry_value; + ldata[0] = binary_op(tmp1, ldata[0]); + } + + context.barrier(); + + // sum local values + { + for(unsigned int k = 1; k < K; k++) + { + const unsigned int offset = K * context.thread_index() + k; + + if (FullBlock || offset < n) + { + if (!(flag_bits & (FlagType(1) << (k - 1)))) + ldata[k] = binary_op(ldata[k - 1], ldata[k]); + } + } + } + + // second level segmented scan + { + // use head flags for segmented scan + sflag[context.thread_index()] = head_flag; sdata[K - 1][context.thread_index()] = ldata[K - 1]; context.barrier(); + + if (FullBlock) + block::inclusive_scan_by_flag(context, sflag, sdata[K-1], binary_op); + else + block::inclusive_scan_by_flag_n(context, sflag, sdata[K-1], n, binary_op); + } + + // update local values + if (context.thread_index() > 0) + { + unsigned int update_bits = (flag_bits << 1) | (left_flag >> (K - 1)); +// TODO remove guard +#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC + unsigned int update_count = __ffs(update_bits) - 1u; // NB: this might wrap around to UINT_MAX +#else + unsigned int update_count = 0; +#endif // THRUST_DEVICE_COMPILER_NVCC + + if (!FullBlock && (K + 1) * context.thread_index() > n) + update_count = thrust::min(n - K * context.thread_index(), update_count); + + ValueType left = sdata[K - 1][context.thread_index() - 1]; + + for(unsigned int k = 0; k < K; k++) + { + if (k < update_count) + ldata[k] = binary_op(left, ldata[k]); + } + } + + context.barrier(); + + // store carry out + if (FullBlock) + { + if (context.thread_index() == CTA_SIZE - 1) + { + carry_value = ldata[K - 1]; + carry_in = (flag_bits & (FlagType(1) << (K - 1))) ? false : true; + carry_index = num_outputs; + } + } + else + { + if (context.thread_index() == (n - 1) / K) + { + for (unsigned int k = 0; k < K; k++) + if (k == (n - 1) % K) + carry_value = ldata[k]; + carry_in = (flag_bits & (FlagType(1) << ((n - 1) % K))) ? 
false : true; + carry_index = num_outputs; + } + } + + // shuffle values + { + FlagType position = output_position; + + for(unsigned int k = 0; k < K; k++) + { + const unsigned int offset = K * context.thread_index() + k; + + if (FullBlock || offset < n) + { + if (flag_bits & (FlagType(1) << k)) + { + sdata[position / CTA_SIZE][position % CTA_SIZE] = ldata[k]; + position++; + } + } + } + } + + context.barrier(); + + + // write values out + for(unsigned int k = 0; k < K; k++) + { + const unsigned int offset = CTA_SIZE * k + context.thread_index(); + + if (offset < num_outputs) + { + OutputIterator2 tmp = ovals + offset; + *tmp = sdata[k][context.thread_index()]; + } + } + + context.barrier(); +} + +template +struct reduce_by_key_closure +{ + InputIterator1 ikeys; + InputIterator2 ivals; + OutputIterator1 okeys; + OutputIterator2 ovals; + BinaryPredicate binary_pred; + BinaryFunction binary_op; + FlagIterator iflags; + IndexIterator interval_counts; + ValueIterator interval_values; + BoolIterator interval_carry; + Decomposition decomp; + Context context; + + typedef Context context_type; + + reduce_by_key_closure(InputIterator1 ikeys, + InputIterator2 ivals, + OutputIterator1 okeys, + OutputIterator2 ovals, + BinaryPredicate binary_pred, + BinaryFunction binary_op, + FlagIterator iflags, + IndexIterator interval_counts, + ValueIterator interval_values, + BoolIterator interval_carry, + Decomposition decomp, + Context context = Context()) + : ikeys(ikeys), ivals(ivals), okeys(okeys), ovals(ovals), binary_pred(binary_pred), binary_op(binary_op), + iflags(iflags), interval_counts(interval_counts), interval_values(interval_values), interval_carry(interval_carry), + decomp(decomp), context(context) {} + + __device__ __thrust_forceinline__ + void operator()(void) + { + typedef typename thrust::iterator_value::type KeyType; + typedef typename thrust::iterator_value::type ValueType; + typedef typename Decomposition::index_type IndexType; + typedef typename thrust::iterator_value::type FlagType; + + const unsigned int CTA_SIZE = context_type::ThreadsPerBlock::value; + +// TODO centralize this mapping (__CUDA_ARCH__ -> smem bytes) +#if __CUDA_ARCH__ >= 200 + const unsigned int SMEM = (48 * 1024); +#else + const unsigned int SMEM = (16 * 1024) - 256; +#endif + const unsigned int SMEM_FIXED = CTA_SIZE * sizeof(FlagType) + sizeof(ValueType) + sizeof(IndexType) + sizeof(bool); + const unsigned int BOUND_1 = (SMEM - SMEM_FIXED) / ((CTA_SIZE + 1) * sizeof(ValueType)); + const unsigned int BOUND_2 = 8 * sizeof(FlagType); + const unsigned int BOUND_3 = 6; + + // TODO replace this with a static_min::value + const unsigned int K = (BOUND_1 < BOUND_2) ? (BOUND_1 < BOUND_3 ? BOUND_1 : BOUND_3) : (BOUND_2 < BOUND_3 ? 
BOUND_2 : BOUND_3); + + __shared__ detail::uninitialized sflag; + __shared__ detail::uninitialized sdata; // padded to avoid bank conflicts + + __shared__ detail::uninitialized carry_value; // storage for carry in and carry out + __shared__ detail::uninitialized carry_index; + __shared__ detail::uninitialized carry_in; + + typename Decomposition::range_type interval = decomp[context.block_index()]; + //thrust::system::detail::internal::index_range interval = decomp[context.block_index()]; + + + if (context.thread_index() == 0) + { + carry_in = false; // act as though the previous segment terminated just before us + + if (context.block_index() == 0) + { + carry_index = 0; + } + else + { + interval_counts += (context.block_index() - 1); + carry_index = *interval_counts; + } + } + + context.barrier(); + + IndexType base = interval.begin(); + + // advance input and output iterators + ikeys += base; + ivals += base; + iflags += base; + okeys += carry_index; + ovals += carry_index; + + const unsigned int unit_size = K * CTA_SIZE; + + // process full units + while (base + unit_size <= interval.end()) + { + const unsigned int n = unit_size; + reduce_by_key_body(context, n, ikeys, ivals, okeys, ovals, binary_pred, binary_op, iflags, sflag.get(), sdata.get(), carry_in.get(), carry_index.get(), carry_value.get()); + base += unit_size; + ikeys += unit_size; + ivals += unit_size; + iflags += unit_size; + okeys += carry_index; + ovals += carry_index; + } + + // process partially full unit at end of input (if necessary) + if (base < interval.end()) + { + const unsigned int n = interval.end() - base; + reduce_by_key_body(context, n, ikeys, ivals, okeys, ovals, binary_pred, binary_op, iflags, sflag.get(), sdata.get(), carry_in.get(), carry_index.get(), carry_value.get()); + } + + if (context.thread_index() == 0) + { + interval_values += context.block_index(); + interval_carry += context.block_index(); + *interval_values = carry_value; + *interval_carry = carry_in; + } + } +}; // end reduce_by_key_closure + +template +struct DefaultPolicy +{ + // typedefs + typedef unsigned int FlagType; + typedef typename thrust::iterator_traits::difference_type IndexType; + typedef typename thrust::iterator_traits::value_type KeyType; + typedef thrust::system::detail::internal::uniform_decomposition Decomposition; + + // the pseudocode for deducing the type of the temporary used below: + // + // if BinaryFunction is AdaptableBinaryFunction + // TemporaryType = AdaptableBinaryFunction::result_type + // else if OutputIterator2 is a "pure" output iterator + // TemporaryType = InputIterator2::value_type + // else + // TemporaryType = OutputIterator2::value_type + // + // XXX upon c++0x, TemporaryType needs to be: + // result_of::type + + typedef typename thrust::detail::eval_if< + thrust::detail::has_result_type::value, + thrust::detail::result_type, + thrust::detail::eval_if< + thrust::detail::is_output_iterator::value, + thrust::iterator_value, + thrust::iterator_value + > + >::type ValueType; + + // XXX WAR problem on sm_11 + // TODO tune this + const static unsigned int ThreadsPerBlock = (thrust::detail::is_pod::value) ? 
256 : 192; + + DefaultPolicy(InputIterator1 first1, InputIterator1 last1) + : decomp(default_decomposition(last1 - first1)) + {} + + // member variables + Decomposition decomp; +}; + +template + thrust::pair + reduce_by_key(execution_policy &exec, + InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + OutputIterator1 keys_output, + OutputIterator2 values_output, + BinaryPredicate binary_pred, + BinaryFunction binary_op, + Policy policy) +{ + typedef typename Policy::FlagType FlagType; + typedef typename Policy::Decomposition Decomposition; + typedef typename Policy::IndexType IndexType; + typedef typename Policy::KeyType KeyType; + typedef typename Policy::ValueType ValueType; + + // temporary arrays + typedef thrust::detail::temporary_array IndexArray; + typedef thrust::detail::temporary_array KeyArray; + typedef thrust::detail::temporary_array ValueArray; + typedef thrust::detail::temporary_array BoolArray; + + Decomposition decomp = policy.decomp; + + // input size + IndexType n = keys_last - keys_first; + + if (n == 0) + return thrust::make_pair(keys_output, values_output); + + IndexArray interval_counts(exec, decomp.size()); + ValueArray interval_values(exec, decomp.size()); + BoolArray interval_carry(exec, decomp.size()); + + // an ode to c++11 auto + typedef thrust::counting_iterator CountingIterator; + typedef thrust::transform_iterator< + tail_flag_functor, + thrust::zip_iterator< + thrust::tuple + > + > FlagIterator; + + FlagIterator iflag= thrust::make_transform_iterator + (thrust::make_zip_iterator(thrust::make_tuple(thrust::counting_iterator(0), keys_first, keys_first + 1)), + tail_flag_functor(n, binary_pred)); + + // count number of tail flags per interval + thrust::system::cuda::detail::reduce_intervals(exec, iflag, interval_counts.begin(), thrust::plus(), decomp); + + thrust::inclusive_scan(exec, + interval_counts.begin(), interval_counts.end(), + interval_counts.begin(), + thrust::plus()); + + // determine output size + const IndexType N = interval_counts[interval_counts.size() - 1]; + + const static unsigned int ThreadsPerBlock = Policy::ThreadsPerBlock; + typedef typename IndexArray::iterator IndexIterator; + typedef typename ValueArray::iterator ValueIterator; + typedef typename BoolArray::iterator BoolIterator; + typedef detail::statically_blocked_thread_array Context; + typedef reduce_by_key_closure Closure; + Closure closure + (keys_first, values_first, + keys_output, values_output, + binary_pred, binary_op, + iflag, + interval_counts.begin(), + interval_values.begin(), + interval_carry.begin(), + decomp); + detail::launch_closure(closure, decomp.size(), ThreadsPerBlock); + + if (decomp.size() > 1) + { + ValueArray interval_values2(exec, decomp.size()); + IndexArray interval_counts2(exec, decomp.size()); + BoolArray interval_carry2(exec, decomp.size()); + + IndexType N2 = + thrust::reduce_by_key + (exec, + thrust::make_zip_iterator(thrust::make_tuple(interval_counts.begin(), interval_carry.begin())), + thrust::make_zip_iterator(thrust::make_tuple(interval_counts.end(), interval_carry.end())), + interval_values.begin(), + thrust::make_zip_iterator(thrust::make_tuple(interval_counts2.begin(), interval_carry2.begin())), + interval_values2.begin(), + thrust::equal_to< thrust::tuple >(), + binary_op).first + - + thrust::make_zip_iterator(thrust::make_tuple(interval_counts2.begin(), interval_carry2.begin())); + + thrust::transform_if + (exec, + interval_values2.begin(), interval_values2.begin() + N2, + 
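+ // (this transform_if folds each carried partial value in interval_values2 into
+ // the output element whose position is recorded in interval_counts2, but only
+ // where the matching interval_carry2 flag is set; the remaining arguments of
+ // the call continue below)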
thrust::make_permutation_iterator(values_output, interval_counts2.begin()), + interval_carry2.begin(), + thrust::make_permutation_iterator(values_output, interval_counts2.begin()), + binary_op, + thrust::identity()); + } + + return thrust::make_pair(keys_output + N, values_output + N); +} + +} // end namespace reduce_by_key_detail + + +template + thrust::pair + reduce_by_key(execution_policy &exec, + InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + OutputIterator1 keys_output, + OutputIterator2 values_output, + BinaryPredicate binary_pred, + BinaryFunction binary_op) +{ + return reduce_by_key_detail::reduce_by_key + (exec, + keys_first, keys_last, values_first, keys_output, values_output, binary_pred, binary_op, + reduce_by_key_detail::DefaultPolicy(keys_first, keys_last)); +} // end reduce_by_key() + +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + +__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END + diff --git a/compat/thrust/system/cuda/detail/reduce_intervals.h b/compat/thrust/system/cuda/detail/reduce_intervals.h new file mode 100644 index 0000000..505d136 --- /dev/null +++ b/compat/thrust/system/cuda/detail/reduce_intervals.h @@ -0,0 +1,53 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file reduce_intervals.h + * \brief CUDA implementations of reduce_intervals algorithms. + */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ + +template +void reduce_intervals(execution_policy &exec, + InputIterator input, + OutputIterator output, + BinaryFunction binary_op, + Decomposition decomp); + +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/cuda/detail/reduce_intervals.inl b/compat/thrust/system/cuda/detail/reduce_intervals.inl new file mode 100644 index 0000000..2381769 --- /dev/null +++ b/compat/thrust/system/cuda/detail/reduce_intervals.inl @@ -0,0 +1,203 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ + +template +struct commutative_reduce_intervals_closure +{ + InputIterator input; + OutputIterator output; + BinaryFunction binary_op; + Decomposition decomposition; + unsigned int shared_array_size; + + typedef Context context_type; + context_type context; + + commutative_reduce_intervals_closure(InputIterator input, OutputIterator output, BinaryFunction binary_op, Decomposition decomposition, unsigned int shared_array_size, Context context = Context()) + : input(input), output(output), binary_op(binary_op), decomposition(decomposition), shared_array_size(shared_array_size), context(context) {} + + __device__ __thrust_forceinline__ + void operator()(void) + { + typedef typename thrust::iterator_value::type OutputType; + extern_shared_ptr shared_array; + + typedef typename Decomposition::index_type index_type; + + // this block processes results in [range.begin(), range.end()) + thrust::system::detail::internal::index_range range = decomposition[context.block_index()]; + + index_type i = range.begin() + context.thread_index(); + + input += i; + + if (range.size() < context.block_dimension()) + { + // compute reduction with the first shared_array_size threads + if (context.thread_index() < thrust::min(shared_array_size,range.size())) + { + OutputType sum = *input; + + i += shared_array_size; + input += shared_array_size; + + while (i < range.end()) + { + OutputType val = *input; + + sum = binary_op(sum, val); + + i += shared_array_size; + input += shared_array_size; + } + + shared_array[context.thread_index()] = sum; + } + } + else + { + // compute reduction with all blockDim.x threads + OutputType sum = *input; + + i += context.block_dimension(); + input += context.block_dimension(); + + while (i < range.end()) + { + OutputType val = *input; + + sum = binary_op(sum, val); + + i += context.block_dimension(); + input += context.block_dimension(); + } + + // write first shared_array_size values into shared memory + if (context.thread_index() < shared_array_size) + shared_array[context.thread_index()] = sum; + + // accumulate remaining values (if any) to shared memory in stages + if (context.block_dimension() > shared_array_size) + { + unsigned int lb = shared_array_size; + unsigned int ub = shared_array_size + lb; + + while (lb < context.block_dimension()) + { + context.barrier(); + + if (lb <= context.thread_index() && context.thread_index() < ub) + { + OutputType tmp = shared_array[context.thread_index() - lb]; + shared_array[context.thread_index() - lb] = binary_op(tmp, sum); + } + + lb += shared_array_size; + ub += shared_array_size; + } + } + } + + context.barrier(); + + block::reduce_n(context, shared_array, thrust::min(range.size(), shared_array_size), binary_op); + + if (context.thread_index() == 0) + { + output += context.block_index(); + *output = shared_array[0]; + } + } +}; + +__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN + +template +void reduce_intervals(execution_policy &, + InputIterator input, + OutputIterator output, + BinaryFunction binary_op, + Decomposition decomp) +{ + // we're attempting to launch a kernel, assert we're compiling with nvcc + // ======================================================================== + // X Note to the user: If you've found this line due to a compiler error, X + // X you need to compile your code using nvcc, rather than g++ or cl.exe X + // 
======================================================================== + THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); + + if (decomp.size() == 0) + return; + + // TODO if (decomp.size() > deviceProperties.maxGridSize[0]) throw cuda exception (or handle general case) + + typedef detail::blocked_thread_array Context; + typedef commutative_reduce_intervals_closure Closure; + typedef typename thrust::iterator_value::type OutputType; + + detail::launch_calculator calculator; + + thrust::tuple config = calculator.with_variable_block_size_available_smem(); + + //size_t max_blocks = thrust::get<0>(config); + size_t block_size = thrust::get<1>(config); + size_t max_memory = thrust::get<2>(config); + + // determine shared array size + size_t shared_array_size = thrust::min(max_memory / sizeof(OutputType), block_size); + size_t shared_array_bytes = sizeof(OutputType) * shared_array_size; + + // TODO if (shared_array_size < 1) throw cuda exception "insufficient shared memory" + + Closure closure(input, output, binary_op, decomp, shared_array_size); + detail::launch_closure(closure, decomp.size(), block_size, shared_array_bytes); +} + +__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END + +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cuda/detail/remove.h b/compat/thrust/system/cuda/detail/remove.h new file mode 100644 index 0000000..a307fc5 --- /dev/null +++ b/compat/thrust/system/cuda/detail/remove.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system has no special version of this algorithm + diff --git a/compat/thrust/system/cuda/detail/replace.h b/compat/thrust/system/cuda/detail/replace.h new file mode 100644 index 0000000..a307fc5 --- /dev/null +++ b/compat/thrust/system/cuda/detail/replace.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +// this system has no special version of this algorithm + diff --git a/compat/thrust/system/cuda/detail/reverse.h b/compat/thrust/system/cuda/detail/reverse.h new file mode 100644 index 0000000..a307fc5 --- /dev/null +++ b/compat/thrust/system/cuda/detail/reverse.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system has no special version of this algorithm + diff --git a/compat/thrust/system/cuda/detail/runtime_introspection.h b/compat/thrust/system/cuda/detail/runtime_introspection.h new file mode 100644 index 0000000..39f6c9f --- /dev/null +++ b/compat/thrust/system/cuda/detail/runtime_introspection.h @@ -0,0 +1,78 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file runtime_introspection.h + * \brief Defines the interface to functions + * providing introspection into the architecture + * of CUDA devices. + */ + +#pragma once + +#include + +// #include this for device_properties_t and function_attributes_t +#include + +// #include this for size_t +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ + + +/*! Returns the current device ordinal. + */ +inline int current_device(); + +/*! Returns a copy of the device_properties_t structure + * that is associated with a given device. + */ +inline device_properties_t device_properties(int device_id); + +/*! Returns a copy of the device_properties_t structure + * that is associated with the current device. + */ +inline device_properties_t device_properties(void); + +/*! Returns a copy of the function_attributes_t structure + * that is associated with a given __global__ function + */ +template +inline function_attributes_t function_attributes(KernelFunction kernel); + +/*! Returns the compute capability of a device in integer format. 
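+ * (computed as 10 * major + minor; see compute_capability() in the .inl below)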
+ * For example, returns 10 for sm_10 and 21 for sm_21 + * \return The compute capability as an integer + */ +inline size_t compute_capability(const device_properties_t &properties); +inline size_t compute_capability(void); + + +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/cuda/detail/runtime_introspection.inl b/compat/thrust/system/cuda/detail/runtime_introspection.inl new file mode 100644 index 0000000..a5cc382 --- /dev/null +++ b/compat/thrust/system/cuda/detail/runtime_introspection.inl @@ -0,0 +1,169 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ +namespace runtime_introspection_detail +{ + + +inline void get_device_properties(device_properties_t &p, int device_id) +{ + cudaDeviceProp properties; + + cudaError_t error = cudaGetDeviceProperties(&properties, device_id); + + if(error) + throw thrust::system_error(error, thrust::cuda_category()); + + // be careful about how this is initialized! + device_properties_t temp = { + properties.major, + { + properties.maxGridSize[0], + properties.maxGridSize[1], + properties.maxGridSize[2] + }, + properties.maxThreadsPerBlock, + properties.maxThreadsPerMultiProcessor, + properties.minor, + properties.multiProcessorCount, + properties.regsPerBlock, + properties.sharedMemPerBlock, + properties.warpSize + }; + + p = temp; +} // end get_device_properties() + + +} // end runtime_introspection_detail + + +inline device_properties_t device_properties(int device_id) +{ + // cache the result of get_device_properties, because it is slow + // only cache the first few devices + static const int max_num_devices = 16; + + static bool properties_exist[max_num_devices] = {0}; + static device_properties_t device_properties[max_num_devices] = {}; + + if(device_id >= max_num_devices) + { + device_properties_t result; + runtime_introspection_detail::get_device_properties(result, device_id); + return result; + } + + if(!properties_exist[device_id]) + { + runtime_introspection_detail::get_device_properties(device_properties[device_id], device_id); + + // disallow the compiler to move the write to properties_exist[device_id] + // before the initialization of device_properties[device_id] + __thrust_compiler_fence(); + + properties_exist[device_id] = true; + } + + return device_properties[device_id]; +} + +inline int current_device() +{ + int result = -1; + + cudaError_t error = cudaGetDevice(&result); + + if(error) + throw thrust::system_error(error, thrust::cuda_category()); + + if(result < 0) + throw thrust::system_error(cudaErrorNoDevice, thrust::cuda_category()); + + return result; +} + +inline device_properties_t device_properties(void) +{ + return device_properties(current_device()); +} + +template +inline function_attributes_t 
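+ // (when compiled with nvcc this queries cudaFuncGetAttributes(); otherwise a
+ // default-constructed function_attributes_t is returned, see the
+ // #ifdef __CUDACC__ branch in the definition below)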
function_attributes(KernelFunction kernel) +{ +// cudaFuncGetAttributes(), used below, only exists when __CUDACC__ is defined +#ifdef __CUDACC__ + typedef void (*fun_ptr_type)(); + + fun_ptr_type fun_ptr = reinterpret_cast(kernel); + + cudaFuncAttributes attributes; + + cudaError_t error = cudaFuncGetAttributes(&attributes, fun_ptr); + + if(error) + { + throw thrust::system_error(error, thrust::cuda_category()); + } + + // be careful about how this is initialized! + function_attributes_t result = { + attributes.constSizeBytes, + attributes.localSizeBytes, + attributes.maxThreadsPerBlock, + attributes.numRegs, + attributes.sharedSizeBytes + }; + + return result; +#else + return function_attributes_t(); +#endif // __CUDACC__ +} + +inline size_t compute_capability(const device_properties_t &properties) +{ + return 10 * properties.major + properties.minor; +} + +inline size_t compute_capability(void) +{ + return compute_capability(device_properties()); +} + + +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cuda/detail/scan.h b/compat/thrust/system/cuda/detail/scan.h new file mode 100644 index 0000000..036c89a --- /dev/null +++ b/compat/thrust/system/cuda/detail/scan.h @@ -0,0 +1,64 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file scan.h + * \brief Scan operations (parallel prefix-sum) [cuda] + */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ + +template + OutputIterator inclusive_scan(execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + AssociativeOperator binary_op); + +template + OutputIterator exclusive_scan(execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + T init, + AssociativeOperator binary_op); + +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/cuda/detail/scan.inl b/compat/thrust/system/cuda/detail/scan.inl new file mode 100644 index 0000000..9d9c6d2 --- /dev/null +++ b/compat/thrust/system/cuda/detail/scan.inl @@ -0,0 +1,82 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file scan.inl + * \brief Inline file for scan.h. 
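+ *
+ * Usage sketch (illustrative only, not part of the original sources; assumes a
+ * thrust::device_vector<int> named d):
+ *
+ *   thrust::inclusive_scan(d.begin(), d.end(), d.begin());   // d[i] becomes d[0] + ... + d[i]
+ *   thrust::exclusive_scan(d.begin(), d.end(), d.begin(),
+ *                          0, thrust::plus<int>());          // shifted by one, seeded with 0
+ *
+ * Both overloads below only assert that nvcc is being used and then forward to
+ * detail::fast_scan.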
+ */ + +#include +#include + +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ + +template + OutputIterator inclusive_scan(execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + AssociativeOperator binary_op) +{ + // we're attempting to launch a kernel, assert we're compiling with nvcc + // ======================================================================== + // X Note to the user: If you've found this line due to a compiler error, X + // X you need to compile your code using nvcc, rather than g++ or cl.exe X + // ======================================================================== + THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); + + return thrust::system::cuda::detail::detail::fast_scan::inclusive_scan(exec, first, last, result, binary_op); +} + +template + OutputIterator exclusive_scan(execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + T init, + AssociativeOperator binary_op) +{ + // we're attempting to launch a kernel, assert we're compiling with nvcc + // ======================================================================== + // X Note to the user: If you've found this line due to a compiler error, X + // X you need to compile your code using nvcc, rather than g++ or cl.exe X + // ======================================================================== + THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); + + return thrust::system::cuda::detail::detail::fast_scan::exclusive_scan(exec, first, last, result, init, binary_op); +} + +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cuda/detail/scan_by_key.h b/compat/thrust/system/cuda/detail/scan_by_key.h new file mode 100644 index 0000000..a307fc5 --- /dev/null +++ b/compat/thrust/system/cuda/detail/scan_by_key.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system has no special version of this algorithm + diff --git a/compat/thrust/system/cuda/detail/scatter.h b/compat/thrust/system/cuda/detail/scatter.h new file mode 100644 index 0000000..a307fc5 --- /dev/null +++ b/compat/thrust/system/cuda/detail/scatter.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +// this system has no special version of this algorithm + diff --git a/compat/thrust/system/cuda/detail/sequence.h b/compat/thrust/system/cuda/detail/sequence.h new file mode 100644 index 0000000..a307fc5 --- /dev/null +++ b/compat/thrust/system/cuda/detail/sequence.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system has no special version of this algorithm + diff --git a/compat/thrust/system/cuda/detail/set_difference.inl b/compat/thrust/system/cuda/detail/set_difference.inl new file mode 100644 index 0000000..33d9884 --- /dev/null +++ b/compat/thrust/system/cuda/detail/set_difference.inl @@ -0,0 +1,138 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ +namespace set_difference_detail +{ + + +struct serial_bounded_set_difference +{ + // max_input_size <= 32 + template + inline __device__ + thrust::detail::uint32_t operator()(Size max_input_size, + InputIterator1 first1, InputIterator1 last1, + InputIterator2 first2, InputIterator2 last2, + OutputIterator result, + Compare comp) + { + thrust::detail::uint32_t active_mask = 0; + thrust::detail::uint32_t active_bit = 1; + + while(first1 != last1 && first2 != last2) + { + if(comp(*first1,*first2)) + { + *result = *first1; + active_mask |= active_bit; + ++first1; + } // end if + else if(comp(*first2,*first1)) + { + ++first2; + } // end else if + else + { + ++first1; + ++first2; + } // end else + + ++result; + active_bit <<= 1; + } // end while + + while(first1 != last1) + { + *result = *first1; + ++first1; + ++result; + active_mask |= active_bit; + active_bit <<= 1; + } + + return active_mask; + } + + + template + inline __device__ + Size count(Size max_input_size, + InputIterator1 first1, InputIterator1 last1, + InputIterator2 first2, InputIterator2 last2, + Compare comp) + { + Size result = 0; + + while(first1 != last1 && first2 != last2) + { + if(comp(*first1,*first2)) + { + ++first1; + ++result; + } // end if + else if(comp(*first2,*first1)) + { + ++first2; + } // end else if + else + { + ++first1; + ++first2; + } // end else + } // end while + + return result + last1 - first1; + } +}; // end serial_bounded_set_difference + + +} // end namespace set_difference_detail + + +template +RandomAccessIterator3 set_difference(execution_policy &exec, + RandomAccessIterator1 first1, + RandomAccessIterator1 last1, + RandomAccessIterator2 first2, + RandomAccessIterator2 last2, + RandomAccessIterator3 result, + Compare comp) +{ + return thrust::system::cuda::detail::detail::set_operation(exec, first1, last1, first2, last2, result, comp, set_difference_detail::serial_bounded_set_difference()); +} // end set_difference + + +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cuda/detail/set_intersection.inl b/compat/thrust/system/cuda/detail/set_intersection.inl new file mode 100644 index 0000000..e4810b6 --- /dev/null +++ b/compat/thrust/system/cuda/detail/set_intersection.inl @@ -0,0 +1,129 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ +namespace set_intersection_detail +{ + + +struct serial_bounded_set_intersection +{ + // max_input_size <= 32 + template + inline __device__ + thrust::detail::uint32_t operator()(Size max_input_size, + InputIterator1 first1, InputIterator1 last1, + InputIterator2 first2, InputIterator2 last2, + OutputIterator result, + Compare comp) + { + thrust::detail::uint32_t active_mask = 0; + thrust::detail::uint32_t active_bit = 1; + + while(first1 != last1 && first2 != last2) + { + if(comp(*first1,*first2)) + { + ++first1; + } // end if + else if(comp(*first2,*first1)) + { + ++first2; + } // end else if + else + { + *result = *first1; + ++first1; + ++first2; + active_mask |= active_bit; + } // end else + + ++result; + active_bit <<= 1; + } // end while + + return active_mask; + } + + + template + inline __device__ + Size count(Size max_input_size, + InputIterator1 first1, InputIterator1 last1, + InputIterator2 first2, InputIterator2 last2, + Compare comp) + { + Size result = 0; + + while(first1 != last1 && first2 != last2) + { + if(comp(*first1,*first2)) + { + ++first1; + } // end if + else if(comp(*first2,*first1)) + { + ++first2; + } // end else if + else + { + ++result; + ++first1; + ++first2; + } // end else + } // end while + + return result; + } +}; // end serial_bounded_set_intersection + + +} // end namespace set_intersection_detail + + +template +RandomAccessIterator3 set_intersection(execution_policy &exec, + RandomAccessIterator1 first1, + RandomAccessIterator1 last1, + RandomAccessIterator2 first2, + RandomAccessIterator2 last2, + RandomAccessIterator3 result, + Compare comp) +{ + return thrust::system::cuda::detail::detail::set_operation(exec, first1, last1, first2, last2, result, comp, set_intersection_detail::serial_bounded_set_intersection()); +} // end set_intersection + + +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cuda/detail/set_operations.h b/compat/thrust/system/cuda/detail/set_operations.h new file mode 100644 index 0000000..040e341 --- /dev/null +++ b/compat/thrust/system/cuda/detail/set_operations.h @@ -0,0 +1,97 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ + + +template +RandomAccessIterator3 set_difference(execution_policy &exec, + RandomAccessIterator1 first1, + RandomAccessIterator1 last1, + RandomAccessIterator2 first2, + RandomAccessIterator2 last2, + RandomAccessIterator3 result, + Compare comp); + + +template +RandomAccessIterator3 set_intersection(execution_policy &exec, + RandomAccessIterator1 first1, + RandomAccessIterator1 last1, + RandomAccessIterator2 first2, + RandomAccessIterator2 last2, + RandomAccessIterator3 result, + Compare comp); + + +template +RandomAccessIterator3 set_symmetric_difference(execution_policy &exec, + RandomAccessIterator1 first1, + RandomAccessIterator1 last1, + RandomAccessIterator2 first2, + RandomAccessIterator2 last2, + RandomAccessIterator3 result, + Compare comp); + + +template +RandomAccessIterator3 set_union(execution_policy &exec, + RandomAccessIterator1 first1, + RandomAccessIterator1 last1, + RandomAccessIterator2 first2, + RandomAccessIterator2 last2, + RandomAccessIterator3 result, + Compare comp); + + +} // end detail +} // end cuda +} // end system +} // end thrust + +#include +#include +#include +#include + diff --git a/compat/thrust/system/cuda/detail/set_symmetric_difference.inl b/compat/thrust/system/cuda/detail/set_symmetric_difference.inl new file mode 100644 index 0000000..112c955 --- /dev/null +++ b/compat/thrust/system/cuda/detail/set_symmetric_difference.inl @@ -0,0 +1,150 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ +namespace set_symmetric_difference_detail +{ + + +struct serial_bounded_set_symmetric_difference +{ + // max_input_size <= 32 + template + inline __device__ + thrust::detail::uint32_t operator()(Size max_input_size, + InputIterator1 first1, InputIterator1 last1, + InputIterator2 first2, InputIterator2 last2, + OutputIterator result, + Compare comp) + { + thrust::detail::uint32_t active_mask = 0; + thrust::detail::uint32_t active_bit = 1; + + while(first1 != last1 && first2 != last2) + { + if(comp(*first1,*first2)) + { + *result = *first1; + active_mask |= active_bit; + ++first1; + } // end if + else if(comp(*first2,*first1)) + { + *result = *first2; + active_mask |= active_bit; + ++first2; + } // end else if + else + { + ++first1; + ++first2; + } // end else + + ++result; + active_bit <<= 1; + } // end while + + while(first1 != last1) + { + *result = *first1; + ++first1; + ++result; + active_mask |= active_bit; + active_bit <<= 1; + } + + while(first2 != last2) + { + *result = *first2; + ++first2; + ++result; + active_mask |= active_bit; + active_bit <<= 1; + } + + return active_mask; + } + + + template + inline __device__ + Size count(Size max_input_size, + InputIterator1 first1, InputIterator1 last1, + InputIterator2 first2, InputIterator2 last2, + Compare comp) + { + Size result = 0; + + while(first1 != last1 && first2 != last2) + { + if(comp(*first1,*first2)) + { + ++first1; + ++result; + } // end if + else if(comp(*first2,*first1)) + { + ++first2; + ++result; + } // end else if + else + { + ++first1; + ++first2; + } // end else + } // end while + + return result + thrust::max(last1 - first1,last2 - first2); + } +}; // end serial_bounded_set_symmetric_difference + + +} // end namespace set_symmetric_difference_detail + + +template +RandomAccessIterator3 set_symmetric_difference(execution_policy &exec, + RandomAccessIterator1 first1, + RandomAccessIterator1 last1, + RandomAccessIterator2 first2, + RandomAccessIterator2 last2, + RandomAccessIterator3 result, + Compare comp) +{ + return thrust::system::cuda::detail::detail::set_operation(exec, first1, last1, first2, last2, result, comp, set_symmetric_difference_detail::serial_bounded_set_symmetric_difference()); +} // end set_symmetric_difference + + +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cuda/detail/set_union.inl b/compat/thrust/system/cuda/detail/set_union.inl new file mode 100644 index 0000000..66cccab --- /dev/null +++ b/compat/thrust/system/cuda/detail/set_union.inl @@ -0,0 +1,150 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ +namespace set_union_detail +{ + + +struct serial_bounded_set_union +{ + // max_input_size <= 32 + template + inline __device__ + thrust::detail::uint32_t operator()(Size max_input_size, + InputIterator1 first1, InputIterator1 last1, + InputIterator2 first2, InputIterator2 last2, + OutputIterator result, + Compare comp) + { + thrust::detail::uint32_t active_mask = 0; + thrust::detail::uint32_t active_bit = 1; + + while(first1 != last1 && first2 != last2) + { + if(comp(*first1,*first2)) + { + *result = *first1; + ++first1; + } // end if + else if(comp(*first2,*first1)) + { + *result = *first2; + ++first2; + } // end else if + else + { + *result = *first1; + ++first1; + ++first2; + } // end else + + ++result; + active_mask |= active_bit; + active_bit <<= 1; + } // end while + + while(first1 != last1) + { + *result = *first1; + ++first1; + ++result; + active_mask |= active_bit; + active_bit <<= 1; + } + + while(first2 != last2) + { + *result = *first2; + ++first2; + ++result; + active_mask |= active_bit; + active_bit <<= 1; + } + + return active_mask; + } + + + template + inline __device__ + Size count(Size max_input_size, + InputIterator1 first1, InputIterator1 last1, + InputIterator2 first2, InputIterator2 last2, + Compare comp) + { + Size result = 0; + + while(first1 != last1 && first2 != last2) + { + if(comp(*first1,*first2)) + { + ++first1; + } // end if + else if(comp(*first2,*first1)) + { + ++first2; + } // end else if + else + { + ++first1; + ++first2; + } // end else + + ++result; + } // end while + + return result + thrust::max(last1 - first1,last2 - first2); + } +}; // end serial_bounded_set_union + + +} // end namespace set_union_detail + + +template +RandomAccessIterator3 set_union(execution_policy &exec, + RandomAccessIterator1 first1, + RandomAccessIterator1 last1, + RandomAccessIterator2 first2, + RandomAccessIterator2 last2, + RandomAccessIterator3 result, + Compare comp) +{ + return thrust::system::cuda::detail::detail::set_operation(exec, first1, last1, first2, last2, result, comp, set_union_detail::serial_bounded_set_union()); +} // end set_union + + +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cuda/detail/sort.h b/compat/thrust/system/cuda/detail/sort.h new file mode 100644 index 0000000..e78d36a --- /dev/null +++ b/compat/thrust/system/cuda/detail/sort.h @@ -0,0 +1,55 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ + +template + void stable_sort(execution_policy &exec, + RandomAccessIterator first, + RandomAccessIterator last, + StrictWeakOrdering comp); + +template + void stable_sort_by_key(execution_policy &exec, + RandomAccessIterator1 keys_first, + RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first, + StrictWeakOrdering comp); + +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/cuda/detail/sort.inl b/compat/thrust/system/cuda/detail/sort.inl new file mode 100644 index 0000000..d7e0a60 --- /dev/null +++ b/compat/thrust/system/cuda/detail/sort.inl @@ -0,0 +1,287 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file sort.inl + * \brief Inline file for sort.h + */ + +#include +#include + +#include +#include +#include +#include +#include +#include + + +/* + * This file implements the following dispatch procedure for cuda::stable_sort() + * and cuda::stable_sort_by_key(). The first level inspects the KeyType + * and StrictWeakOrdering to determine whether a sort assuming primitive-typed + * data may be applied. + * + * If a sort assuming primitive-typed data can be applied (i.e., a radix sort), + * the input ranges are first trivialized (turned into simple contiguous ranges + * if they are not already). To implement descending orderings, an ascending + * sort will be reversed. + * + * If a sort assuming primitive-typed data cannot be applied, a comparison-based + * sort is used. Depending on the size of the key and value types, one level of + * indirection may be applied to their input ranges. This transformation + * may be applied to either range to convert an ill-suited problem (i.e. sorting with + * large keys or large value) into a problem more amenable to the underlying + * merge sort algorithm. 
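+ *
+ * Illustrative sketch of the dispatch (not part of the original sources;
+ * my_compare stands for any user-defined comparator):
+ *
+ *   thrust::device_vector<float> keys = ...;
+ *   // primitive path: arithmetic key type with thrust::less / thrust::greater
+ *   thrust::stable_sort(keys.begin(), keys.end(), thrust::greater<float>());
+ *   // comparison path: anything else falls back to the merge sort
+ *   thrust::stable_sort(keys.begin(), keys.end(), my_compare());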
+ */ + + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ + + +namespace stable_sort_detail +{ + + +template + struct can_use_primitive_sort + : thrust::detail::and_< + thrust::detail::is_arithmetic, + thrust::detail::or_< + thrust::detail::is_same >, + thrust::detail::is_same > + > + > +{}; + + +template + struct enable_if_primitive_sort + : thrust::detail::enable_if< + can_use_primitive_sort< + typename iterator_value::type, + StrictWeakCompare + >::value + > +{}; + + +template + struct enable_if_comparison_sort + : thrust::detail::disable_if< + can_use_primitive_sort< + typename iterator_value::type, + StrictWeakCompare + >::value + > +{}; + + +template + typename enable_if_primitive_sort::type + stable_sort(execution_policy &exec, + RandomAccessIterator first, + RandomAccessIterator last, + StrictWeakOrdering comp) +{ + // ensure sequence has trivial iterators + thrust::detail::trivial_sequence keys(exec, first, last); + + // CUDA path for thrust::stable_sort with primitive keys + // (e.g. int, float, short, etc.) and a less or greater comparison + // method is implemented with a primitive sort + thrust::system::cuda::detail::detail::stable_primitive_sort(exec, keys.begin(), keys.end()); + + // copy results back, if necessary + if(!thrust::detail::is_trivial_iterator::value) + { + thrust::copy(exec, keys.begin(), keys.end(), first); + } + + // if comp is greater then reverse the keys + typedef typename thrust::iterator_traits::value_type KeyType; + const static bool reverse = thrust::detail::is_same >::value; + + if(reverse) + { + thrust::reverse(first, last); + } +} + +template + typename enable_if_comparison_sort::type + stable_sort(execution_policy &exec, + RandomAccessIterator first, + RandomAccessIterator last, + StrictWeakOrdering comp) +{ + // decide whether to sort keys indirectly + typedef typename thrust::iterator_value::type KeyType; + typedef thrust::detail::integral_constant 8)> use_key_indirection; + + conditional_temporary_indirect_ordering potentially_indirect_keys(derived_cast(exec), first, last, comp); + + thrust::system::cuda::detail::detail::stable_merge_sort(exec, + potentially_indirect_keys.begin(), + potentially_indirect_keys.end(), + potentially_indirect_keys.comp()); +} + +template + typename enable_if_primitive_sort::type + stable_sort_by_key(execution_policy &exec, + RandomAccessIterator1 keys_first, + RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first, + StrictWeakOrdering comp) +{ + // path for thrust::stable_sort_by_key with primitive keys + // (e.g. int, float, short, etc.) 
and a less or greater comparison + // method is implemented with stable_primitive_sort_by_key + + // if comp is greater then reverse the keys and values + typedef typename thrust::iterator_traits::value_type KeyType; + const static bool reverse = thrust::detail::is_same >::value; + + // note, we also have to reverse the (unordered) input to preserve stability + if (reverse) + { + thrust::reverse(exec, keys_first, keys_last); + thrust::reverse(exec, values_first, values_first + (keys_last - keys_first)); + } + + // ensure sequences have trivial iterators + thrust::detail::trivial_sequence keys(exec, keys_first, keys_last); + thrust::detail::trivial_sequence values(exec, values_first, values_first + (keys_last - keys_first)); + + thrust::system::cuda::detail::detail::stable_primitive_sort_by_key(exec, keys.begin(), keys.end(), values.begin()); + + // copy results back, if necessary + if(!thrust::detail::is_trivial_iterator::value) + thrust::copy(exec, keys.begin(), keys.end(), keys_first); + if(!thrust::detail::is_trivial_iterator::value) + thrust::copy(exec, values.begin(), values.end(), values_first); + + if (reverse) + { + thrust::reverse(exec, keys_first, keys_last); + thrust::reverse(exec, values_first, values_first + (keys_last - keys_first)); + } +} + + +template + typename enable_if_comparison_sort::type + stable_sort_by_key(execution_policy &exec, + RandomAccessIterator1 keys_first, + RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first, + StrictWeakOrdering comp) +{ + // decide whether to apply indirection to either range + typedef typename thrust::iterator_value::type KeyType; + typedef typename thrust::iterator_value::type ValueType; + + typedef thrust::detail::integral_constant 8)> use_key_indirection; + typedef thrust::detail::integral_constant 4)> use_value_indirection; + + conditional_temporary_indirect_ordering< + use_key_indirection, + DerivedPolicy, + RandomAccessIterator1, + StrictWeakOrdering + > potentially_indirect_keys(derived_cast(exec), keys_first, keys_last, comp); + + conditional_temporary_indirect_permutation< + use_value_indirection, + DerivedPolicy, + RandomAccessIterator2 + > potentially_indirect_values(derived_cast(exec), values_first, values_first + (keys_last - keys_first)); + + thrust::system::cuda::detail::detail::stable_merge_sort_by_key(exec, + potentially_indirect_keys.begin(), + potentially_indirect_keys.end(), + potentially_indirect_values.begin(), + potentially_indirect_keys.comp()); +} + + +} // end namespace stable_sort_detail + + +template + void stable_sort(execution_policy &exec, + RandomAccessIterator first, + RandomAccessIterator last, + StrictWeakOrdering comp) +{ + // we're attempting to launch a kernel, assert we're compiling with nvcc + // ======================================================================== + // X Note to the user: If you've found this line due to a compiler error, X + // X you need to compile your code using nvcc, rather than g++ or cl.exe X + // ======================================================================== + THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); + + stable_sort_detail::stable_sort(exec, first, last, comp); +} + + +template + void stable_sort_by_key(execution_policy &exec, + RandomAccessIterator1 keys_first, + RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first, + StrictWeakOrdering comp) +{ + // we're attempting to launch a kernel, assert we're compiling with nvcc + // 
======================================================================== + // X Note to the user: If you've found this line due to a compiler error, X + // X you need to compile your code using nvcc, rather than g++ or cl.exe X + // ======================================================================== + THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); + + stable_sort_detail::stable_sort_by_key(exec, keys_first, keys_last, values_first, comp); +} + + +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cuda/detail/swap_ranges.h b/compat/thrust/system/cuda/detail/swap_ranges.h new file mode 100644 index 0000000..9b1949e --- /dev/null +++ b/compat/thrust/system/cuda/detail/swap_ranges.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// cuda has no special swap_ranges + diff --git a/compat/thrust/system/cuda/detail/synchronize.h b/compat/thrust/system/cuda/detail/synchronize.h new file mode 100644 index 0000000..762f4a3 --- /dev/null +++ b/compat/thrust/system/cuda/detail/synchronize.h @@ -0,0 +1,41 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#pragma once + +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ + +inline void synchronize(const char *message = ""); + +inline void synchronize_if_enabled(const char *message = ""); + +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/cuda/detail/synchronize.inl b/compat/thrust/system/cuda/detail/synchronize.inl new file mode 100644 index 0000000..5f70f79 --- /dev/null +++ b/compat/thrust/system/cuda/detail/synchronize.inl @@ -0,0 +1,56 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ + +void synchronize(const char *message) +{ + cudaError_t error = cudaThreadSynchronize(); + if(error) + { + throw thrust::system_error(error, thrust::cuda_category(), std::string("synchronize: ") + message); + } // end if +} // end synchronize() + +void synchronize_if_enabled(const char *message) +{ +// XXX this could potentially be a runtime decision +#if __THRUST_SYNCHRONOUS + synchronize(message); +#else + // WAR "unused parameter" warning + (void) message; +#endif +} // end synchronize_if_enabled() + +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cuda/detail/tabulate.h b/compat/thrust/system/cuda/detail/tabulate.h new file mode 100644 index 0000000..a307fc5 --- /dev/null +++ b/compat/thrust/system/cuda/detail/tabulate.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system has no special version of this algorithm + diff --git a/compat/thrust/system/cuda/detail/temporary_buffer.h b/compat/thrust/system/cuda/detail/temporary_buffer.h new file mode 100644 index 0000000..628bd75 --- /dev/null +++ b/compat/thrust/system/cuda/detail/temporary_buffer.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system has no special temporary buffer functions + diff --git a/compat/thrust/system/cuda/detail/temporary_indirect_permutation.h b/compat/thrust/system/cuda/detail/temporary_indirect_permutation.h new file mode 100644 index 0000000..3d05f44 --- /dev/null +++ b/compat/thrust/system/cuda/detail/temporary_indirect_permutation.h @@ -0,0 +1,217 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ + + +template + struct temporary_indirect_permutation +{ + private: + typedef unsigned int size_type; + typedef thrust::detail::temporary_array array_type; + + public: + temporary_indirect_permutation(DerivedPolicy &exec, RandomAccessIterator first, RandomAccessIterator last) + : m_exec(exec), + m_src_first(first), + m_src_last(last), + m_permutation(0, m_exec, last - first) + { + // generate sorted index sequence + thrust::sequence(exec, m_permutation.begin(), m_permutation.end()); + } + + ~temporary_indirect_permutation() + { + // permute the source array using the indices + typedef typename thrust::iterator_value::type value_type; + thrust::detail::temporary_array temp(m_exec, m_src_first, m_src_last); + thrust::gather(m_exec, m_permutation.begin(), m_permutation.end(), temp.begin(), m_src_first); + } + + typedef typename array_type::iterator iterator; + + iterator begin() + { + return m_permutation.begin(); + } + + iterator end() + { + return m_permutation.end(); + } + + private: + DerivedPolicy &m_exec; + RandomAccessIterator m_src_first, m_src_last; + thrust::detail::temporary_array m_permutation; +}; + + +template + struct iterator_range_with_execution_policy +{ + iterator_range_with_execution_policy(DerivedPolicy &exec, RandomAccessIterator first, RandomAccessIterator last) + : m_exec(exec), m_first(first), m_last(last) + {} + + typedef RandomAccessIterator iterator; + + iterator begin() + { + return m_first; + } + + iterator end() + { + return m_last; + } + + DerivedPolicy &exec() + { + return m_exec; + } + + DerivedPolicy &m_exec; + RandomAccessIterator m_first, m_last; +}; + + +template + struct conditional_temporary_indirect_permutation + : thrust::detail::eval_if< + Condition::value, + thrust::detail::identity_ >, + thrust::detail::identity_ > + >::type +{ + typedef typename thrust::detail::eval_if< + Condition::value, + thrust::detail::identity_ >, + thrust::detail::identity_ > + >::type super_t; + + conditional_temporary_indirect_permutation(DerivedPolicy &exec, RandomAccessIterator first, RandomAccessIterator last) + : super_t(exec, first, last) + {} +}; + + +template + struct temporary_indirect_ordering + : temporary_indirect_permutation +{ + private: + typedef temporary_indirect_permutation super_t; + + public: + temporary_indirect_ordering(DerivedPolicy &exec, RandomAccessIterator first, RandomAccessIterator last, Compare comp) + : super_t(exec, first, last), + m_comp(first, comp) + {} + + struct compare + { + RandomAccessIterator first; + + thrust::detail::host_device_function< + Compare, + bool + > comp; + + compare(RandomAccessIterator first, Compare comp) + : first(first), comp(comp) + {} + + template + __host__ __device__ + bool operator()(Integral a, Integral b) + { + return comp(first[a], first[b]); + } + }; + + compare comp() const + { + return m_comp; + } + + private: + compare m_comp; +}; + + +template + struct iterator_range_with_execution_policy_and_compare + : iterator_range_with_execution_policy 
+{ + typedef iterator_range_with_execution_policy super_t; + + iterator_range_with_execution_policy_and_compare(DerivedPolicy &exec, RandomAccessIterator first, RandomAccessIterator last, Compare comp) + : super_t(exec, first, last), m_comp(comp) + {} + + typedef Compare compare; + + compare comp() + { + return m_comp; + } + + Compare m_comp; +}; + + +template + struct conditional_temporary_indirect_ordering + : thrust::detail::eval_if< + Condition::value, + thrust::detail::identity_ >, + thrust::detail::identity_ > + >::type +{ + typedef typename thrust::detail::eval_if< + Condition::value, + thrust::detail::identity_ >, + thrust::detail::identity_ > + >::type super_t; + + conditional_temporary_indirect_ordering(DerivedPolicy &exec, RandomAccessIterator first, RandomAccessIterator last, Compare comp) + : super_t(exec, first, last, comp) + {} +}; + + +} // end detail +} // end cuda +} // end system +} // end thrust + diff --git a/compat/thrust/system/cuda/detail/transform.h b/compat/thrust/system/cuda/detail/transform.h new file mode 100644 index 0000000..0af8705 --- /dev/null +++ b/compat/thrust/system/cuda/detail/transform.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// cuda has no special transform + diff --git a/compat/thrust/system/cuda/detail/transform_reduce.h b/compat/thrust/system/cuda/detail/transform_reduce.h new file mode 100644 index 0000000..a307fc5 --- /dev/null +++ b/compat/thrust/system/cuda/detail/transform_reduce.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system has no special version of this algorithm + diff --git a/compat/thrust/system/cuda/detail/transform_scan.h b/compat/thrust/system/cuda/detail/transform_scan.h new file mode 100644 index 0000000..a307fc5 --- /dev/null +++ b/compat/thrust/system/cuda/detail/transform_scan.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
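temporary_indirect_permutation and temporary_indirect_ordering above implement an indirect (argsort-style) sort: build an index sequence, order the indices with a comparator that dereferences into the original data, then gather through the permutation when the guard object is destroyed. A sketch of the same idea written against Thrust's public API; indirect_sort and compare_by_value are illustrative names, not part of Thrust.

#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#include <thrust/gather.h>

struct compare_by_value
{
  const float *data;

  compare_by_value(const float *d) : data(d) {}

  __host__ __device__
  bool operator()(unsigned int a, unsigned int b) const
  {
    // compare indices by the values they refer to,
    // like the nested compare functor above
    return data[a] < data[b];
  }
};

void indirect_sort(thrust::device_vector<float> &data)
{
  // sorted index sequence 0, 1, 2, ..., as in the constructor above
  thrust::device_vector<unsigned int> perm(data.size());
  thrust::sequence(perm.begin(), perm.end());

  // order the indices rather than the values
  thrust::sort(perm.begin(), perm.end(),
               compare_by_value(thrust::raw_pointer_cast(data.data())));

  // permute the source array, as the destructor above does with gather()
  thrust::device_vector<float> temp(data.size());
  thrust::gather(perm.begin(), perm.end(), data.begin(), temp.begin());
  data.swap(temp);
}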
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system has no special version of this algorithm + diff --git a/compat/thrust/system/cuda/detail/trivial_copy.h b/compat/thrust/system/cuda/detail/trivial_copy.h new file mode 100644 index 0000000..e0e898a --- /dev/null +++ b/compat/thrust/system/cuda/detail/trivial_copy.h @@ -0,0 +1,54 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ + +template + void trivial_copy_n(execution_policy &exec, + RandomAccessIterator1 first, + Size n, + RandomAccessIterator2 result); + +template + void trivial_copy_n(cross_system &exec, + RandomAccessIterator1 first, + Size n, + RandomAccessIterator2 result); + +} // end detail +} // end cuda +} // end system +} // end thrust + +#include + diff --git a/compat/thrust/system/cuda/detail/trivial_copy.inl b/compat/thrust/system/cuda/detail/trivial_copy.inl new file mode 100644 index 0000000..d23a4ef --- /dev/null +++ b/compat/thrust/system/cuda/detail/trivial_copy.inl @@ -0,0 +1,114 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ +namespace detail +{ + +namespace trivial_copy_detail +{ + +inline void checked_cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind) +{ + cudaError_t error = cudaMemcpy(dst,src,count,kind); + if(error) + { + throw thrust::system_error(error, thrust::cuda_category()); + } // end error +} // end checked_cudaMemcpy() + + +template + cudaMemcpyKind cuda_memcpy_kind(const thrust::cuda::execution_policy &, + const thrust::cpp::execution_policy &) +{ + return cudaMemcpyDeviceToHost; +} // end cuda_memcpy_kind() + + +template + cudaMemcpyKind cuda_memcpy_kind(const thrust::cpp::execution_policy &, + const thrust::cuda::execution_policy &) +{ + return cudaMemcpyHostToDevice; +} // end cuda_memcpy_kind() + + +} // end namespace trivial_copy_detail + + +template + void trivial_copy_n(execution_policy &exec, + RandomAccessIterator1 first, + Size n, + RandomAccessIterator2 result) +{ + typedef typename thrust::iterator_value::type T; + + void *dst = thrust::raw_pointer_cast(&*result); + const void *src = thrust::raw_pointer_cast(&*first); + + trivial_copy_detail::checked_cudaMemcpy(dst, src, n * sizeof(T), cudaMemcpyDeviceToDevice); +} + + +template + void trivial_copy_n(cross_system &systems, + RandomAccessIterator1 first, + Size n, + RandomAccessIterator2 result) +{ + typedef typename thrust::iterator_value::type T; + + void *dst = thrust::raw_pointer_cast(&*result); + const void *src = thrust::raw_pointer_cast(&*first); + + cudaMemcpyKind kind = trivial_copy_detail::cuda_memcpy_kind(thrust::detail::derived_cast(systems.system1), thrust::detail::derived_cast(systems.system2)); + + trivial_copy_detail::checked_cudaMemcpy(dst, src, n * sizeof(T), kind); +} + + +} // end namespace detail +} // end namespace cuda +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/cuda/detail/uninitialized_copy.h b/compat/thrust/system/cuda/detail/uninitialized_copy.h new file mode 100644 index 0000000..a307fc5 --- /dev/null +++ b/compat/thrust/system/cuda/detail/uninitialized_copy.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system has no special version of this algorithm + diff --git a/compat/thrust/system/cuda/detail/uninitialized_fill.h b/compat/thrust/system/cuda/detail/uninitialized_fill.h new file mode 100644 index 0000000..a307fc5 --- /dev/null +++ b/compat/thrust/system/cuda/detail/uninitialized_fill.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
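The checked_cudaMemcpy helper in trivial_copy.inl above wraps cudaMemcpy and converts a failing cudaError_t into a thrust::system_error tagged with cuda_category(). A sketch of the same pattern applied in user code for a device-to-host copy; copy_to_host is an illustrative name.

#include <cuda_runtime.h>
#include <thrust/system_error.h>
#include <thrust/system/cuda/error.h>
#include <vector>

void copy_to_host(const float *d_src, std::vector<float> &h_dst)
{
  if(h_dst.empty()) return;

  cudaError_t error = cudaMemcpy(&h_dst[0], d_src,
                                 h_dst.size() * sizeof(float),
                                 cudaMemcpyDeviceToHost);
  if(error != cudaSuccess)
  {
    // same error-to-exception conversion used by checked_cudaMemcpy above
    throw thrust::system_error(error, thrust::cuda_category());
  }
}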
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system has no special version of this algorithm + diff --git a/compat/thrust/system/cuda/detail/unique.h b/compat/thrust/system/cuda/detail/unique.h new file mode 100644 index 0000000..a307fc5 --- /dev/null +++ b/compat/thrust/system/cuda/detail/unique.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system has no special version of this algorithm + diff --git a/compat/thrust/system/cuda/detail/unique_by_key.h b/compat/thrust/system/cuda/detail/unique_by_key.h new file mode 100644 index 0000000..a307fc5 --- /dev/null +++ b/compat/thrust/system/cuda/detail/unique_by_key.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system has no special version of this algorithm + diff --git a/compat/thrust/system/cuda/detail/vector.inl b/compat/thrust/system/cuda/detail/vector.inl new file mode 100644 index 0000000..3659876 --- /dev/null +++ b/compat/thrust/system/cuda/detail/vector.inl @@ -0,0 +1,97 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in ccudaliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ + +template + vector + ::vector() + : super_t() +{} + +template + vector + ::vector(size_type n) + : super_t(n) +{} + +template + vector + ::vector(size_type n, const value_type &value) + : super_t(n,value) +{} + +template + vector + ::vector(const vector &x) + : super_t(x) +{} + +template + template + vector + ::vector(const thrust::detail::vector_base &x) + : super_t(x) +{} + +template + template + vector + ::vector(const std::vector &x) + : super_t(x) +{} + +template + template + vector + ::vector(InputIterator first, InputIterator last) + : super_t(first,last) +{} + +template + template + vector & + vector + ::operator=(const std::vector &x) +{ + super_t::operator=(x); + return *this; +} + +template + template + vector & + vector + ::operator=(const thrust::detail::vector_base &x) +{ + super_t::operator=(x); + return *this; +} + +} // end cuda +} // end system +} // end thrust + diff --git a/compat/thrust/system/cuda/error.h b/compat/thrust/system/cuda/error.h new file mode 100644 index 0000000..8d09853 --- /dev/null +++ b/compat/thrust/system/cuda/error.h @@ -0,0 +1,186 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file thrust/system/cuda/error.h + * \brief CUDA-specific error reporting + */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ + +namespace system +{ + +namespace cuda +{ + +/*! \addtogroup system + * \{ + */ + +// To construct an error_code after a CUDA Runtime error: +// +// error_code(::cudaGetLastError(), cuda_category()) + +// XXX N3000 prefers enum class errc { ... } +namespace errc +{ + +/*! \p errc_t enumerates the kinds of CUDA Runtime errors. 
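The member definitions in vector.inl above give thrust::cuda::vector the same interface as the other Thrust containers; only the memory system differs. A brief usage sketch, with sort_in_cuda_vector as an illustrative name:

#include <thrust/system/cuda/vector.h>
#include <thrust/sort.h>

void sort_in_cuda_vector()
{
  // a container whose storage lives in memory available to the CUDA system
  thrust::cuda::vector<int> v(4);
  v[0] = 3; v[1] = 1; v[2] = 2; v[3] = 0;

  // the iterators are tagged with the CUDA system, so the algorithm
  // dispatches to the CUDA backend
  thrust::sort(v.begin(), v.end());
}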
+ */ +enum errc_t +{ + // from cuda/include/driver_types.h + // mirror their order + success = cudaSuccess, + missing_configuration = cudaErrorMissingConfiguration, + memory_allocation = cudaErrorMemoryAllocation, + initialization_error = cudaErrorInitializationError, + launch_failure = cudaErrorLaunchFailure, + prior_launch_failure = cudaErrorPriorLaunchFailure, + launch_timeout = cudaErrorLaunchTimeout, + launch_out_of_resources = cudaErrorLaunchOutOfResources, + invalid_device_function = cudaErrorInvalidDeviceFunction, + invalid_configuration = cudaErrorInvalidConfiguration, + invalid_device = cudaErrorInvalidDevice, + invalid_value = cudaErrorInvalidValue, + invalid_pitch_value = cudaErrorInvalidPitchValue, + invalid_symbol = cudaErrorInvalidSymbol, + map_buffer_object_failed = cudaErrorMapBufferObjectFailed, + unmap_buffer_object_failed = cudaErrorUnmapBufferObjectFailed, + invalid_host_pointer = cudaErrorInvalidHostPointer, + invalid_device_pointer = cudaErrorInvalidDevicePointer, + invalid_texture = cudaErrorInvalidTexture, + invalid_texture_binding = cudaErrorInvalidTextureBinding, + invalid_channel_descriptor = cudaErrorInvalidChannelDescriptor, + invalid_memcpy_direction = cudaErrorInvalidMemcpyDirection, + address_of_constant_error = cudaErrorAddressOfConstant, + texture_fetch_failed = cudaErrorTextureFetchFailed, + texture_not_bound = cudaErrorTextureNotBound, + synchronization_error = cudaErrorSynchronizationError, + invalid_filter_setting = cudaErrorInvalidFilterSetting, + invalid_norm_setting = cudaErrorInvalidNormSetting, + mixed_device_execution = cudaErrorMixedDeviceExecution, + cuda_runtime_unloading = cudaErrorCudartUnloading, + unknown = cudaErrorUnknown, + not_yet_implemented = cudaErrorNotYetImplemented, + memory_value_too_large = cudaErrorMemoryValueTooLarge, + invalid_resource_handle = cudaErrorInvalidResourceHandle, + not_ready = cudaErrorNotReady, + insufficient_driver = cudaErrorInsufficientDriver, + set_on_active_process_error = cudaErrorSetOnActiveProcess, + no_device = cudaErrorNoDevice, + ecc_uncorrectable = cudaErrorECCUncorrectable, + +#if CUDART_VERSION >= 4020 + shared_object_symbol_not_found = cudaErrorSharedObjectSymbolNotFound, + shared_object_init_failed = cudaErrorSharedObjectInitFailed, + unsupported_limit = cudaErrorUnsupportedLimit, + duplicate_variable_name = cudaErrorDuplicateVariableName, + duplicate_texture_name = cudaErrorDuplicateTextureName, + duplicate_surface_name = cudaErrorDuplicateSurfaceName, + devices_unavailable = cudaErrorDevicesUnavailable, + invalid_kernel_image = cudaErrorInvalidKernelImage, + no_kernel_image_for_device = cudaErrorNoKernelImageForDevice, + incompatible_driver_context = cudaErrorIncompatibleDriverContext, + peer_access_already_enabled = cudaErrorPeerAccessAlreadyEnabled, + peer_access_not_enabled = cudaErrorPeerAccessNotEnabled, + device_already_in_use = cudaErrorDeviceAlreadyInUse, + profiler_disabled = cudaErrorProfilerDisabled, + assert_triggered = cudaErrorAssert, + too_many_peers = cudaErrorTooManyPeers, + host_memory_already_registered = cudaErrorHostMemoryAlreadyRegistered, + host_memory_not_registered = cudaErrorHostMemoryNotRegistered, + operating_system_error = cudaErrorOperatingSystem, +#endif + +#if CUDART_VERSION >= 5000 + peer_access_unsupported = cudaErrorPeerAccessUnsupported, + launch_max_depth_exceeded = cudaErrorLaunchMaxDepthExceeded, + launch_file_scoped_texture_used = cudaErrorLaunchFileScopedTex, + launch_file_scoped_surface_used = cudaErrorLaunchFileScopedSurf, + sync_depth_exceeded = 
cudaErrorSyncDepthExceeded, + attempted_operation_not_permitted = cudaErrorNotPermitted, + attempted_operation_not_supported = cudaErrorNotSupported, +#endif + + startup_failure = cudaErrorStartupFailure +}; // end errc_t + + +} // end namespace errc + +} // end namespace cuda + +/*! \return A reference to an object of a type derived from class \p thrust::error_category. + * \note The object's \p equivalent virtual functions shall behave as specified + * for the class \p thrust::error_category. The object's \p name virtual function shall + * return a pointer to the string "cuda". The object's + * \p default_error_condition virtual function shall behave as follows: + * + * If the argument ev corresponds to a CUDA error value, the function + * shall return error_condition(ev,cuda_category()). + * Otherwise, the function shall return system_category.default_error_condition(ev). + */ +inline const error_category &cuda_category(void); + + +// XXX N3000 prefers is_error_code_enum + +/*! Specialization of \p is_error_code_enum for \p cuda::errc::errc_t + */ +template<> struct is_error_code_enum : thrust::detail::true_type {}; + + +// XXX replace cuda::errc::errc_t with cuda::errc upon c++0x +/*! \return error_code(static_cast(e), cuda::error_category()) + */ +inline error_code make_error_code(cuda::errc::errc_t e); + + +// XXX replace cuda::errc::errc_t with cuda::errc upon c++0x +/*! \return error_condition(static_cast(e), cuda::error_category()). + */ +inline error_condition make_error_condition(cuda::errc::errc_t e); + +/*! \} // end system + */ + + +} // end system + +namespace cuda +{ + +// XXX replace with using system::cuda_errc upon c++0x +namespace errc = system::cuda::errc; + +} // end cuda + +using system::cuda_category; + +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/cuda/execution_policy.h b/compat/thrust/system/cuda/execution_policy.h new file mode 100644 index 0000000..bbd33de --- /dev/null +++ b/compat/thrust/system/cuda/execution_policy.h @@ -0,0 +1,165 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +/*! \file thrust/system/cuda/execution_policy.h + * \brief Execution policies for Thrust's CUDA system. 
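error.h above recommends building an error_code for the most recent CUDA Runtime error with error_code(::cudaGetLastError(), cuda_category()). A sketch of that usage; report_last_cuda_error is an illustrative name.

#include <cuda_runtime.h>
#include <thrust/system/cuda/error.h>
#include <cstdio>

void report_last_cuda_error()
{
  thrust::system::error_code code(::cudaGetLastError(),
                                  thrust::cuda_category());

  if(code.value() != cudaSuccess)
  {
    // message() resolves the numeric value through cuda_category()
    std::printf("CUDA error: %s\n", code.message().c_str());
  }
}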
+ */ + +#include + +// get the execution policies definitions first +#include + +// get the definition of par +#include + +// now get all the algorithm defintitions + +// the order of the following #includes seems to matter, unfortunately + +// primitives come first, in order of increasing sophistication +#include +#include +#include + +#include +#include +#include +#include +#include + +// these are alphabetical +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +// define these entities here for the purpose of Doxygenating them +// they are actually defined elsewhere +#if 0 +namespace thrust +{ +namespace system +{ +namespace cuda +{ + + +/*! \addtogroup execution_policies + * \{ + */ + + +/*! \p thrust::cuda::execution_policy is the base class for all Thrust parallel execution + * policies which are derived from Thrust's CUDA backend system. + */ +template +struct execution_policy : thrust::execution_policy +{}; + + +/*! \p cuda::tag is a type representing Thrust's CUDA backend system in C++'s type system. + * Iterators "tagged" with a type which is convertible to \p cuda::tag assert that they may be + * "dispatched" to algorithm implementations in the \p cuda system. + */ +struct tag : thrust::system::cuda::execution_policy { unspecified }; + + +/*! \p thrust::cuda::par is the parallel execution policy associated with Thrust's CUDA + * backend system. + * + * Instead of relying on implicit algorithm dispatch through iterator system tags, users may + * directly target Thrust's CUDA backend system by providing \p thrust::cuda::par as an algorithm + * parameter. + * + * Explicit dispatch can be useful in avoiding the introduction of data copies into containers such + * as \p thrust::cuda::vector. + * + * The type of \p thrust::cuda::par is implementation-defined. + * + * The following code snippet demonstrates how to use \p thrust::cuda::par to explicitly dispatch an + * invocation of \p thrust::for_each to the CUDA backend system: + * + * \code + * #include + * #include + * #include + * + * struct printf_functor + * { + * __host__ __device__ + * void operator()(int x) + * { + * printf("%d\n"); + * } + * }; + * ... + * int vec[3]; + * vec[0] = 0; vec[1] = 1; vec[2] = 2; + * + * thrust::for_each(thrust::cuda::par, vec.begin(), vec.end(), printf_functor()); + * + * // 0 1 2 is printed to standard output in some unspecified order + * \endcode + */ +static const unspecified par; + + +/*! \} + */ + + +} // end cuda +} // end system +} // end thrust +#endif + + diff --git a/compat/thrust/system/cuda/experimental/pinned_allocator.h b/compat/thrust/system/cuda/experimental/pinned_allocator.h new file mode 100644 index 0000000..5294659 --- /dev/null +++ b/compat/thrust/system/cuda/experimental/pinned_allocator.h @@ -0,0 +1,239 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
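The Doxygen snippet for thrust::cuda::par above is illustrative only (its printf call omits the argument and it calls .begin() on a raw array). A compilable variant of the same explicit-dispatch idea, assuming a device_vector in place of the raw array:

#include <thrust/for_each.h>
#include <thrust/device_vector.h>
#include <thrust/system/cuda/execution_policy.h>
#include <cstdio>

struct printf_functor
{
  __host__ __device__
  void operator()(int x)
  {
    printf("%d\n", x);   // device-side printf requires sm_20 or newer
  }
};

int main()
{
  thrust::device_vector<int> vec(3);
  vec[0] = 0; vec[1] = 1; vec[2] = 2;

  // dispatch explicitly to the CUDA backend instead of relying on iterator tags
  thrust::for_each(thrust::cuda::par, vec.begin(), vec.end(), printf_functor());

  return 0;
}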
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file thrust/system/cuda/experimental/pinned_allocator.h + * \brief An allocator which creates new elements in "pinned" memory with \p cudaMallocHost + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ + +namespace system +{ + +namespace cuda +{ + +namespace experimental +{ + +/*! \addtogroup memory_management Memory Management + * \addtogroup memory_management_classes + * \ingroup memory_management + * \{ + */ + +/*! \p pinned_allocator is a CUDA-specific host memory allocator + * that employs \c cudaMallocHost for allocation. + * + * \see http://www.sgi.com/tech/stl/Allocators.html + */ +template class pinned_allocator; + +template<> + class pinned_allocator +{ + public: + typedef void value_type; + typedef void * pointer; + typedef const void * const_pointer; + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + + // convert a pinned_allocator to pinned_allocator + template + struct rebind + { + typedef pinned_allocator other; + }; // end rebind +}; // end pinned_allocator + + +template + class pinned_allocator +{ + public: + typedef T value_type; + typedef T* pointer; + typedef const T* const_pointer; + typedef T& reference; + typedef const T& const_reference; + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + + // convert a pinned_allocator to pinned_allocator + template + struct rebind + { + typedef pinned_allocator other; + }; // end rebind + + /*! \p pinned_allocator's null constructor does nothing. + */ + __host__ __device__ + inline pinned_allocator() {} + + /*! \p pinned_allocator's null destructor does nothing. + */ + __host__ __device__ + inline ~pinned_allocator() {} + + /*! \p pinned_allocator's copy constructor does nothing. + */ + __host__ __device__ + inline pinned_allocator(pinned_allocator const &) {} + + /*! This version of \p pinned_allocator's copy constructor + * is templated on the \c value_type of the \p pinned_allocator + * to copy from. It is provided merely for convenience; it + * does nothing. + */ + template + __host__ __device__ + inline pinned_allocator(pinned_allocator const &) {} + + /*! This method returns the address of a \c reference of + * interest. + * + * \p r The \c reference of interest. + * \return \c r's address. + */ + __host__ __device__ + inline pointer address(reference r) { return &r; } + + /*! This method returns the address of a \c const_reference + * of interest. + * + * \p r The \c const_reference of interest. + * \return \c r's address. + */ + __host__ __device__ + inline const_pointer address(const_reference r) { return &r; } + + /*! This method allocates storage for objects in pinned host + * memory. + * + * \p cnt The number of objects to allocate. + * \return a \c pointer to the newly allocated objects. + * \note This method does not invoke \p value_type's constructor. + * It is the responsibility of the caller to initialize the + * objects at the returned \c pointer. 
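A hedged usage sketch of pinned_allocator: plugging it into thrust::host_vector so host staging buffers are allocated with cudaMallocHost, which typically speeds up host/device transfers. The pinned_host_vector typedef and round_trip function are illustrative, not part of Thrust.

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/system/cuda/experimental/pinned_allocator.h>

typedef thrust::host_vector<
  float,
  thrust::system::cuda::experimental::pinned_allocator<float>
> pinned_host_vector;

void round_trip()
{
  pinned_host_vector h(1024, 1.0f);             // pinned host memory
  thrust::device_vector<float> d = h;           // host-to-device copy
  thrust::copy(d.begin(), d.end(), h.begin());  // device-to-host copy back
}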
+ */ + __host__ + inline pointer allocate(size_type cnt, + const_pointer = 0) + { + if(cnt > this->max_size()) + { + throw std::bad_alloc(); + } // end if + + pointer result(0); + cudaError_t error = cudaMallocHost(reinterpret_cast(&result), cnt * sizeof(value_type)); + + if(error) + { + throw std::bad_alloc(); + } // end if + + return result; + } // end allocate() + + /*! This method deallocates pinned host memory previously allocated + * with this \c pinned_allocator. + * + * \p p A \c pointer to the previously allocated memory. + * \p cnt The number of objects previously allocated at + * \p p. + * \note This method does not invoke \p value_type's destructor. + * It is the responsibility of the caller to destroy + * the objects stored at \p p. + */ + __host__ + inline void deallocate(pointer p, size_type cnt) + { + cudaError_t error = cudaFreeHost(p); + + if(error) + { + throw thrust::system_error(error, thrust::cuda_category()); + } // end if + } // end deallocate() + + /*! This method returns the maximum size of the \c cnt parameter + * accepted by the \p allocate() method. + * + * \return The maximum number of objects that may be allocated + * by a single call to \p allocate(). + */ + inline size_type max_size() const + { + return (std::numeric_limits::max)() / sizeof(T); + } // end max_size() + + /*! This method tests this \p pinned_allocator for equality to + * another. + * + * \param x The other \p pinned_allocator of interest. + * \return This method always returns \c true. + */ + __host__ __device__ + inline bool operator==(pinned_allocator const& x) { return true; } + + /*! This method tests this \p pinned_allocator for inequality + * to another. + * + * \param x The other \p pinned_allocator of interest. + * \return This method always returns \c false. + */ + __host__ __device__ + inline bool operator!=(pinned_allocator const &x) { return !operator==(x); } +}; // end pinned_allocator + +/*! \} + */ + +} // end experimental + +} // end cuda + +} // end system + +// alias cuda's members at top-level +namespace cuda +{ + +namespace experimental +{ + +using thrust::system::cuda::experimental::pinned_allocator; + +} // end experimental + +} // end cuda + +} // end thrust + diff --git a/compat/thrust/system/cuda/memory.h b/compat/thrust/system/cuda/memory.h new file mode 100644 index 0000000..368eea2 --- /dev/null +++ b/compat/thrust/system/cuda/memory.h @@ -0,0 +1,421 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in ccudaliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file thrust/system/cuda/memory.h + * \brief Managing memory associated with Thrust's CUDA system. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace cuda +{ + +template class pointer; + +} // end cuda +} // end system +} // end thrust + + +/*! 
\cond + */ + +// specialize std::iterator_traits to avoid problems with the name of +// pointer's constructor shadowing its nested pointer type +// do this before pointer is defined so the specialization is correctly +// used inside the definition +namespace std +{ + +template + struct iterator_traits > +{ + private: + typedef thrust::system::cuda::pointer ptr; + + public: + typedef typename ptr::iterator_category iterator_category; + typedef typename ptr::value_type value_type; + typedef typename ptr::difference_type difference_type; + typedef ptr pointer; + typedef typename ptr::reference reference; +}; // end iterator_traits + +} // end std + +/*! \endcond + */ + + +namespace thrust +{ +namespace system +{ + +/*! \addtogroup system_backends Systems + * \ingroup system + * \{ + */ + +/*! \namespace thrust::system::cuda + * \brief \p thrust::system::cuda is the namespace containing functionality for allocating, manipulating, + * and deallocating memory available to Thrust's CUDA backend system. + * The identifiers are provided in a separate namespace underneath thrust::system + * for import convenience but are also aliased in the top-level thrust::tbb + * namespace for easy access. + * + */ +namespace cuda +{ + +// forward declaration of reference for pointer +template class reference; + +/*! \cond + */ + +// XXX nvcc + msvc have trouble instantiating reference below +// this is a workaround +namespace detail +{ + +template + struct reference_msvc_workaround +{ + typedef thrust::system::cuda::reference type; +}; // end reference_msvc_workaround + +} // end detail + +/*! \endcond + */ + +#if 0 +/*! \p cuda::tag is type representing Thrust's CUDA backend system in C++'s type system. + * Iterators "tagged" with a type which is convertible to \p cuda::tag assert that they may be + * "dispatched" to algorithm implementations in the \p cuda system. + */ +struct tag { unspecified }; +#endif + +/*! \p pointer stores a pointer to an object allocated in memory available to the cuda system. + * This type provides type safety when dispatching standard algorithms on ranges resident + * in cuda memory. + * + * \p pointer has pointer semantics: it may be dereferenced and manipulated with pointer arithmetic. + * + * \p pointer can be created with the function \p cuda::malloc, or by explicitly calling its constructor + * with a raw pointer. + * + * The raw pointer encapsulated by a \p pointer may be obtained by eiter its get member function + * or the \p raw_pointer_cast function. + * + * \note \p pointer is not a "smart" pointer; it is the programmer's responsibility to deallocate memory + * pointed to by \p pointer. + * + * \tparam T specifies the type of the pointee. + * + * \see cuda::malloc + * \see cuda::free + * \see raw_pointer_cast + */ +template + class pointer + : public thrust::pointer< + T, + thrust::system::cuda::tag, + thrust::system::cuda::reference, + thrust::system::cuda::pointer + > +{ + /*! \cond + */ + + private: + typedef thrust::pointer< + T, + thrust::system::cuda::tag, + //thrust::system::cuda::reference, + typename detail::reference_msvc_workaround::type, + thrust::system::cuda::pointer + > super_t; + + /*! \endcond + */ + + public: + + /*! \p pointer's no-argument constructor initializes its encapsulated pointer to \c 0. + */ + __host__ __device__ + pointer() : super_t() {} + + /*! This constructor allows construction of a pointer from a T*. + * + * \param ptr A raw pointer to copy from, presumed to point to a location in memory + * accessible by the \p tbb system. 
+ * \tparam OtherT \p OtherT shall be convertible to \p T. + */ + template + __host__ __device__ + explicit pointer(OtherT *ptr) : super_t(ptr) {} + + /*! This constructor allows construction from another pointer-like object with related type. + * + * \param other The \p OtherPointer to copy. + * \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible + * to \p thrust::system::cuda::tag and its element type shall be convertible to \p T. + */ + template + __host__ __device__ + pointer(const OtherPointer &other, + typename thrust::detail::enable_if_pointer_is_convertible< + OtherPointer, + pointer + >::type * = 0) : super_t(other) {} + + /*! Assignment operator allows assigning from another pointer-like object with related type. + * + * \param other The other pointer-like object to assign from. + * \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible + * to \p thrust::system::cuda::tag and its element type shall be convertible to \p T. + */ + template + __host__ __device__ + typename thrust::detail::enable_if_pointer_is_convertible< + OtherPointer, + pointer, + pointer & + >::type + operator=(const OtherPointer &other) + { + return super_t::operator=(other); + } +}; // end pointer + + +/*! \p reference is a wrapped reference to an object stored in memory available to the \p cuda system. + * \p reference is the type of the result of dereferencing a \p cuda::pointer. + * + * \tparam T Specifies the type of the referenced object. + */ +template + class reference + : public thrust::reference< + T, + thrust::system::cuda::pointer, + thrust::system::cuda::reference + > +{ + /*! \cond + */ + + private: + typedef thrust::reference< + T, + thrust::system::cuda::pointer, + thrust::system::cuda::reference + > super_t; + + /*! \endcond + */ + + public: + /*! \cond + */ + + typedef typename super_t::value_type value_type; + typedef typename super_t::pointer pointer; + + /*! \endcond + */ + + /*! This constructor initializes this \p reference to refer to an object + * pointed to by the given \p pointer. After this \p reference is constructed, + * it shall refer to the object pointed to by \p ptr. + * + * \param ptr A \p pointer to copy from. + */ + __host__ __device__ + explicit reference(const pointer &ptr) + : super_t(ptr) + {} + + /*! This constructor accepts a const reference to another \p reference of related type. + * After this \p reference is constructed, it shall refer to the same object as \p other. + * + * \param other A \p reference to copy from. + * \tparam OtherT The element type of the other \p reference. + * + * \note This constructor is templated primarily to allow initialization of reference + * from reference. + */ + template + __host__ __device__ + reference(const reference &other, + typename thrust::detail::enable_if_convertible< + typename reference::pointer, + pointer + >::type * = 0) + : super_t(other) + {} + + /*! Copy assignment operator copy assigns from another \p reference of related type. + * + * \param other The other \p reference to assign from. + * \return *this + * \tparam OtherT The element type of the other \p reference. + */ + template + __host__ __device__ + reference &operator=(const reference &other); + + /*! Assignment operator assigns from a \p value_type. + * + * \param x The \p value_type to assign from. + * \return *this + */ + __host__ __device__ + reference &operator=(const value_type &x); +}; // end reference + +/*! Exchanges the values of two objects referred to by \p reference. 
+ * \p x The first \p reference of interest. + * \p y The second \p reference ot interest. + */ +template +__host__ __device__ +void swap(reference x, reference y); + +/*! Allocates an area of memory available to Thrust's cuda system. + * \param n Number of bytes to allocate. + * \return A cuda::pointer pointing to the beginning of the newly + * allocated memory. A null cuda::pointer is returned if + * an error occurs. + * \note The cuda::pointer returned by this function must be + * deallocated with \p cuda::free. + * \see cuda::free + * \see std::malloc + */ +inline pointer malloc(std::size_t n); + +/*! Allocates a typed area of memory available to Thrust's cuda system. + * \param n Number of elements to allocate. + * \return A cuda::pointer pointing to the beginning of the newly + * allocated memory. A null cuda::pointer is returned if + * an error occurs. + * \note The cuda::pointer returned by this function must be + * deallocated with \p cuda::free. + * \see cuda::free + * \see std::malloc + */ +template +inline pointer malloc(std::size_t n); + +/*! Deallocates an area of memory previously allocated by cuda::malloc. + * \param ptr A cuda::pointer pointing to the beginning of an area + * of memory previously allocated with cuda::malloc. + * \see cuda::malloc + * \see std::free + */ +inline void free(pointer ptr); + +// XXX upon c++11 +// template using allocator = thrust::detail::malloc_allocator >; + +/*! \p cuda::allocator is the default allocator used by the \p cuda system's containers such as + * cuda::vector if no user-specified allocator is provided. \p cuda::allocator allocates + * (deallocates) storage with \p cuda::malloc (\p cuda::free). + */ +template + struct allocator + : thrust::detail::malloc_allocator< + T, + tag, + pointer + > +{ + /*! The \p rebind metafunction provides the type of an \p allocator + * instantiated with another type. + * + * \tparam U The other type to use for instantiation. + */ + template + struct rebind + { + /*! The typedef \p other gives the type of the rebound \p allocator. + */ + typedef allocator other; + }; + + /*! No-argument constructor has no effect. + */ + __host__ __device__ + inline allocator() {} + + /*! Copy constructor has no effect. + */ + __host__ __device__ + inline allocator(const allocator &) {} + + /*! Constructor from other \p allocator has no effect. + */ + template + __host__ __device__ + inline allocator(const allocator &) {} + + /*! Destructor has no effect. + */ + __host__ __device__ + inline ~allocator() {} +}; // end allocator + +} // end cuda + +/*! \} + */ + +} // end system + +/*! \namespace thrust::cuda + * \brief \p thrust::cuda is a top-level alias for thrust::system::cuda. + */ +namespace cuda +{ + +using thrust::system::cuda::pointer; +using thrust::system::cuda::reference; +using thrust::system::cuda::malloc; +using thrust::system::cuda::free; +using thrust::system::cuda::allocator; + +} // end cuda + +} // end thrust + +#include + diff --git a/compat/thrust/system/cuda/vector.h b/compat/thrust/system/cuda/vector.h new file mode 100644 index 0000000..ac47a84 --- /dev/null +++ b/compat/thrust/system/cuda/vector.h @@ -0,0 +1,148 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in ccudaliance with the License. 
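memory.h above pairs cuda::pointer with cuda::malloc and cuda::free. A sketch of manual allocation through them; as the documentation notes, the pointer is not a smart pointer, so the caller must free it. fill_device_buffer is an illustrative name.

#include <thrust/system/cuda/memory.h>
#include <thrust/fill.h>

void fill_device_buffer()
{
  // typed allocation in memory available to the CUDA system
  thrust::cuda::pointer<int> p = thrust::cuda::malloc<int>(64);

  // the pointer is tagged, so the algorithm dispatches to the CUDA backend
  thrust::fill(p, p + 64, 13);

  // deallocate explicitly; cuda::pointer does not manage lifetime
  thrust::cuda::free(p);
}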
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file thrust/system/cuda/vector.h + * \brief A dynamically-sizable array of elements which reside in memory available to + * Thrust's CUDA system. + */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ + +// forward declaration of host_vector +template class host_vector; + +namespace system +{ +namespace cuda +{ + +// XXX upon c++11 +// template > using vector = thrust::detail::vector_base; + +/*! \p cuda::vector is a container that supports random access to elements, + * constant time removal of elements at the end, and linear time insertion + * and removal of elements at the beginning or in the middle. The number of + * elements in a \p cuda::vector may vary dynamically; memory management is + * automatic. The elements contained in a \p cuda::vector reside in memory + * available to the \p cuda system. + * + * \tparam T The element type of the \p cuda::vector. + * \tparam Allocator The allocator type of the \p cuda::vector. Defaults to \p cuda::allocator. + * + * \see http://www.sgi.com/tech/stl/Vector.html + * \see host_vector For the documentation of the complete interface which is + * shared by \p cuda::vector + * \see device_vector + */ +template > + class vector + : public thrust::detail::vector_base +{ + /*! \cond + */ + private: + typedef thrust::detail::vector_base super_t; + /*! \endcond + */ + + public: + + /*! \cond + */ + typedef typename super_t::size_type size_type; + typedef typename super_t::value_type value_type; + /*! \endcond + */ + + /*! This constructor creates an empty \p cuda::vector. + */ + vector(); + + /*! This constructor creates a \p cuda::vector with \p n default-constructed elements. + * \param n The size of the \p cuda::vector to create. + */ + explicit vector(size_type n); + + /*! This constructor creates a \p cuda::vector with \p n copies of \p value. + * \param n The size of the \p cuda::vector to create. + * \param value An element to copy. + */ + explicit vector(size_type n, const value_type &value); + + /*! Copy constructor copies from another \p cuda::vector. + * \param x The other \p cuda::vector to copy. + */ + vector(const vector &x); + + /*! This constructor copies from another Thrust vector-like object. + * \param x The other object to copy from. + */ + template + vector(const thrust::detail::vector_base &x); + + /*! This constructor copies from a \c std::vector. + * \param x The \c std::vector to copy from. + */ + template + vector(const std::vector &x); + + /*! This constructor creates a \p cuda::vector by copying from a range. + * \param first The beginning of the range. + * \param last The end of the range. + */ + template + vector(InputIterator first, InputIterator last); + + // XXX vector_base should take a Derived type so we don't have to define these superfluous assigns + // + /*! Assignment operator assigns from a \c std::vector. + * \param x The \c std::vector to assign from. + * \return *this + */ + template + vector &operator=(const std::vector &x); + + /*! Assignment operator assigns from another Thrust vector-like object. + * \param x The other object to assign from. 
+ * \return *this + */ + template + vector &operator=(const thrust::detail::vector_base &x); +}; // end vector + +} // end cuda +} // end system + +// alias system::cuda names at top-level +namespace cuda +{ + +using thrust::system::cuda::vector; + +} // end cuda + +} // end thrust + +#include + diff --git a/compat/thrust/system/detail/adl/adjacent_difference.h b/compat/thrust/system/detail/adl/adjacent_difference.h new file mode 100644 index 0000000..246c116 --- /dev/null +++ b/compat/thrust/system/detail/adl/adjacent_difference.h @@ -0,0 +1,32 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// the purpose of this header is to #include the adjacent_difference.h header +// of the host and device systems. It should be #included in any +// code which uses adl to dispatch adjacent_difference + +#define __THRUST_HOST_SYSTEM_ADJACENT_DIFFERENCE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/adjacent_difference.h> +#include __THRUST_HOST_SYSTEM_ADJACENT_DIFFERENCE_HEADER +#undef __THRUST_HOST_SYSTEM_ADJACENT_DIFFERENCE_HEADER + +#define __THRUST_DEVICE_SYSTEM_ADJACENT_DIFFERENCE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/adjacent_difference.h> +#include __THRUST_DEVICE_SYSTEM_ADJACENT_DIFFERENCE_HEADER +#undef __THRUST_DEVICE_SYSTEM_ADJACENT_DIFFERENCE_HEADER + diff --git a/compat/thrust/system/detail/adl/assign_value.h b/compat/thrust/system/detail/adl/assign_value.h new file mode 100644 index 0000000..b5c588a --- /dev/null +++ b/compat/thrust/system/detail/adl/assign_value.h @@ -0,0 +1,32 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// the purpose of this header is to #include the assign_value.h header +// of the host and device systems. 
It should be #included in any +// code which uses adl to dispatch assign_value + +#define __THRUST_HOST_SYSTEM_ASSIGN_VALUE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/assign_value.h> +#include __THRUST_HOST_SYSTEM_ASSIGN_VALUE_HEADER +#undef __THRUST_HOST_SYSTEM_ASSIGN_VALUE_HEADER + +#define __THRUST_DEVICE_SYSTEM_ASSIGN_VALUE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/assign_value.h> +#include __THRUST_DEVICE_SYSTEM_ASSIGN_VALUE_HEADER +#undef __THRUST_DEVICE_SYSTEM_ASSIGN_VALUE_HEADER + diff --git a/compat/thrust/system/detail/adl/binary_search.h b/compat/thrust/system/detail/adl/binary_search.h new file mode 100644 index 0000000..7accfbc --- /dev/null +++ b/compat/thrust/system/detail/adl/binary_search.h @@ -0,0 +1,32 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// the purpose of this header is to #include the binary_search.h header +// of the host and device systems. It should be #included in any +// code which uses adl to dispatch binary_search + +#define __THRUST_HOST_SYSTEM_BINARY_SEARCH_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/binary_search.h> +#include __THRUST_HOST_SYSTEM_BINARY_SEARCH_HEADER +#undef __THRUST_HOST_SYSTEM_BINARY_SEARCH_HEADER + +#define __THRUST_DEVICE_SYSTEM_BINARY_SEARCH_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/binary_search.h> +#include __THRUST_DEVICE_SYSTEM_BINARY_SEARCH_HEADER +#undef __THRUST_DEVICE_SYSTEM_BINARY_SEARCH_HEADER + diff --git a/compat/thrust/system/detail/adl/copy.h b/compat/thrust/system/detail/adl/copy.h new file mode 100644 index 0000000..91a32cd --- /dev/null +++ b/compat/thrust/system/detail/adl/copy.h @@ -0,0 +1,32 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// the purpose of this header is to #include the copy.h header +// of the host and device systems. 
It should be #included in any +// code which uses adl to dispatch copy + +#define __THRUST_HOST_SYSTEM_COPY_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/copy.h> +#include __THRUST_HOST_SYSTEM_COPY_HEADER +#undef __THRUST_HOST_SYSTEM_COPY_HEADER + +#define __THRUST_DEVICE_SYSTEM_COPY_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/copy.h> +#include __THRUST_DEVICE_SYSTEM_COPY_HEADER +#undef __THRUST_DEVICE_SYSTEM_COPY_HEADER + diff --git a/compat/thrust/system/detail/adl/copy_if.h b/compat/thrust/system/detail/adl/copy_if.h new file mode 100644 index 0000000..fd1df97 --- /dev/null +++ b/compat/thrust/system/detail/adl/copy_if.h @@ -0,0 +1,32 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy_if.h of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// the purpose of this header is to #include the copy_if.h header +// of the host and device systems. It should be #included in any +// code which uses adl to dispatch copy_if + +#define __THRUST_HOST_SYSTEM_BINARY_SEARCH_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/copy_if.h> +#include __THRUST_HOST_SYSTEM_BINARY_SEARCH_HEADER +#undef __THRUST_HOST_SYSTEM_BINARY_SEARCH_HEADER + +#define __THRUST_DEVICE_SYSTEM_BINARY_SEARCH_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/copy_if.h> +#include __THRUST_DEVICE_SYSTEM_BINARY_SEARCH_HEADER +#undef __THRUST_DEVICE_SYSTEM_BINARY_SEARCH_HEADER + diff --git a/compat/thrust/system/detail/adl/count.h b/compat/thrust/system/detail/adl/count.h new file mode 100644 index 0000000..0dd9591 --- /dev/null +++ b/compat/thrust/system/detail/adl/count.h @@ -0,0 +1,32 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a count of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// the purpose of this header is to #include the count.h header +// of the host and device systems. 
It should be #included in any +// code which uses adl to dispatch count + +#define __THRUST_HOST_SYSTEM_COUNT_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/count.h> +#include __THRUST_HOST_SYSTEM_COUNT_HEADER +#undef __THRUST_HOST_SYSTEM_COUNT_HEADER + +#define __THRUST_DEVICE_SYSTEM_COUNT_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/count.h> +#include __THRUST_DEVICE_SYSTEM_COUNT_HEADER +#undef __THRUST_DEVICE_SYSTEM_COUNT_HEADER + diff --git a/compat/thrust/system/detail/adl/equal.h b/compat/thrust/system/detail/adl/equal.h new file mode 100644 index 0000000..f933d4f --- /dev/null +++ b/compat/thrust/system/detail/adl/equal.h @@ -0,0 +1,32 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a equal of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// the purpose of this header is to #include the equal.h header +// of the host and device systems. It should be #included in any +// code which uses adl to dispatch equal + +#define __THRUST_HOST_SYSTEM_EQUAL_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/equal.h> +#include __THRUST_HOST_SYSTEM_EQUAL_HEADER +#undef __THRUST_HOST_SYSTEM_EQUAL_HEADER + +#define __THRUST_DEVICE_SYSTEM_EQUAL_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/equal.h> +#include __THRUST_DEVICE_SYSTEM_EQUAL_HEADER +#undef __THRUST_DEVICE_SYSTEM_EQUAL_HEADER + diff --git a/compat/thrust/system/detail/adl/extrema.h b/compat/thrust/system/detail/adl/extrema.h new file mode 100644 index 0000000..c766570 --- /dev/null +++ b/compat/thrust/system/detail/adl/extrema.h @@ -0,0 +1,32 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a extrema of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// the purpose of this header is to #include the extrema.h header +// of the host and device systems. 
It should be #included in any +// code which uses adl to dispatch extrema + +#define __THRUST_HOST_SYSTEM_EXTREMA_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/extrema.h> +#include __THRUST_HOST_SYSTEM_EXTREMA_HEADER +#undef __THRUST_HOST_SYSTEM_EXTREMA_HEADER + +#define __THRUST_DEVICE_SYSTEM_EXTREMA_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/extrema.h> +#include __THRUST_DEVICE_SYSTEM_EXTREMA_HEADER +#undef __THRUST_DEVICE_SYSTEM_EXTREMA_HEADER + diff --git a/compat/thrust/system/detail/adl/fill.h b/compat/thrust/system/detail/adl/fill.h new file mode 100644 index 0000000..b241b8a --- /dev/null +++ b/compat/thrust/system/detail/adl/fill.h @@ -0,0 +1,32 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a fill of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// the purpose of this header is to #include the fill.h header +// of the host and device systems. It should be #included in any +// code which uses adl to dispatch fill + +#define __THRUST_HOST_SYSTEM_FILL_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/fill.h> +#include __THRUST_HOST_SYSTEM_FILL_HEADER +#undef __THRUST_HOST_SYSTEM_FILL_HEADER + +#define __THRUST_DEVICE_SYSTEM_FILL_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/fill.h> +#include __THRUST_DEVICE_SYSTEM_FILL_HEADER +#undef __THRUST_DEVICE_SYSTEM_FILL_HEADER + diff --git a/compat/thrust/system/detail/adl/find.h b/compat/thrust/system/detail/adl/find.h new file mode 100644 index 0000000..7c99f3e --- /dev/null +++ b/compat/thrust/system/detail/adl/find.h @@ -0,0 +1,32 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a fill of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// the purpose of this header is to #include the find.h header +// of the host and device systems. 
It should be #included in any +// code which uses adl to dispatch find + +#define __THRUST_HOST_SYSTEM_FIND_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/find.h> +#include __THRUST_HOST_SYSTEM_FIND_HEADER +#undef __THRUST_HOST_SYSTEM_FIND_HEADER + +#define __THRUST_DEVICE_SYSTEM_FIND_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/find.h> +#include __THRUST_DEVICE_SYSTEM_FIND_HEADER +#undef __THRUST_DEVICE_SYSTEM_FIND_HEADER + diff --git a/compat/thrust/system/detail/adl/for_each.h b/compat/thrust/system/detail/adl/for_each.h new file mode 100644 index 0000000..0b2717f --- /dev/null +++ b/compat/thrust/system/detail/adl/for_each.h @@ -0,0 +1,32 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// the purpose of this header is to #include the for_each.h header +// of the host and device systems. It should be #included in any +// code which uses adl to dispatch for_each + +#define __THRUST_HOST_SYSTEM_FOR_EACH_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/for_each.h> +#include __THRUST_HOST_SYSTEM_FOR_EACH_HEADER +#undef __THRUST_HOST_SYSTEM_FOR_EACH_HEADER + +#define __THRUST_DEVICE_SYSTEM_FOR_EACH_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/for_each.h> +#include __THRUST_DEVICE_SYSTEM_FOR_EACH_HEADER +#undef __THRUST_DEVICE_SYSTEM_FOR_EACH_HEADER + diff --git a/compat/thrust/system/detail/adl/gather.h b/compat/thrust/system/detail/adl/gather.h new file mode 100644 index 0000000..da4c1d1 --- /dev/null +++ b/compat/thrust/system/detail/adl/gather.h @@ -0,0 +1,32 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a fill of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// the purpose of this header is to #include the gather.h header +// of the host and device systems. 
It should be #included in any
+// code which uses adl to dispatch gather
+
+#define __THRUST_HOST_SYSTEM_GATHER_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/gather.h>
+#include __THRUST_HOST_SYSTEM_GATHER_HEADER
+#undef __THRUST_HOST_SYSTEM_GATHER_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_GATHER_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/gather.h>
+#include __THRUST_DEVICE_SYSTEM_GATHER_HEADER
+#undef __THRUST_DEVICE_SYSTEM_GATHER_HEADER
+
diff --git a/compat/thrust/system/detail/adl/generate.h b/compat/thrust/system/detail/adl/generate.h
new file mode 100644
index 0000000..3a98847
--- /dev/null
+++ b/compat/thrust/system/detail/adl/generate.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright 2008-2012 NVIDIA Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include the generate.h header
+// of the host and device systems. It should be #included in any
+// code which uses adl to dispatch generate
+
+#define __THRUST_HOST_SYSTEM_GENERATE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/generate.h>
+#include __THRUST_HOST_SYSTEM_GENERATE_HEADER
+#undef __THRUST_HOST_SYSTEM_GENERATE_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_GENERATE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/generate.h>
+#include __THRUST_DEVICE_SYSTEM_GENERATE_HEADER
+#undef __THRUST_DEVICE_SYSTEM_GENERATE_HEADER
+
diff --git a/compat/thrust/system/detail/adl/get_value.h b/compat/thrust/system/detail/adl/get_value.h
new file mode 100644
index 0000000..ed4ef2c
--- /dev/null
+++ b/compat/thrust/system/detail/adl/get_value.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright 2008-2012 NVIDIA Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include the get_value.h header
+// of the host and device systems.
It should be #included in any +// code which uses adl to dispatch get_value + +#define __THRUST_HOST_SYSTEM_GET_VALUE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/get_value.h> +#include __THRUST_HOST_SYSTEM_GET_VALUE_HEADER +#undef __THRUST_HOST_SYSTEM_GET_VALUE_HEADER + +#define __THRUST_DEVICE_SYSTEM_GET_VALUE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/get_value.h> +#include __THRUST_DEVICE_SYSTEM_GET_VALUE_HEADER +#undef __THRUST_DEVICE_SYSTEM_GET_VALUE_HEADER + diff --git a/compat/thrust/system/detail/adl/inner_product.h b/compat/thrust/system/detail/adl/inner_product.h new file mode 100644 index 0000000..18cc65b --- /dev/null +++ b/compat/thrust/system/detail/adl/inner_product.h @@ -0,0 +1,32 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a fill of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// the purpose of this header is to #include the inner_product.h header +// of the host and device systems. It should be #included in any +// code which uses adl to dispatch inner_product + +#define __THRUST_HOST_SYSTEM_INNER_PRODUCT_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/inner_product.h> +#include __THRUST_HOST_SYSTEM_INNER_PRODUCT_HEADER +#undef __THRUST_HOST_SYSTEM_INNER_PRODUCT_HEADER + +#define __THRUST_DEVICE_SYSTEM_INNER_PRODUCT_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/inner_product.h> +#include __THRUST_DEVICE_SYSTEM_INNER_PRODUCT_HEADER +#undef __THRUST_DEVICE_SYSTEM_INNER_PRODUCT_HEADER + diff --git a/compat/thrust/system/detail/adl/iter_swap.h b/compat/thrust/system/detail/adl/iter_swap.h new file mode 100644 index 0000000..b302c25 --- /dev/null +++ b/compat/thrust/system/detail/adl/iter_swap.h @@ -0,0 +1,32 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// the purpose of this header is to #include the iter_swap.h header +// of the host and device systems. 
It should be #included in any +// code which uses adl to dispatch iter_swap + +#define __THRUST_HOST_SYSTEM_ITER_SWAP_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/iter_swap.h> +#include __THRUST_HOST_SYSTEM_ITER_SWAP_HEADER +#undef __THRUST_HOST_SYSTEM_ITER_SWAP_HEADER + +#define __THRUST_DEVICE_SYSTEM_ITER_SWAP_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/iter_swap.h> +#include __THRUST_DEVICE_SYSTEM_ITER_SWAP_HEADER +#undef __THRUST_DEVICE_SYSTEM_ITER_SWAP_HEADER + diff --git a/compat/thrust/system/detail/adl/logical.h b/compat/thrust/system/detail/adl/logical.h new file mode 100644 index 0000000..585f71a --- /dev/null +++ b/compat/thrust/system/detail/adl/logical.h @@ -0,0 +1,32 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a fill of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// the purpose of this header is to #include the logical.h header +// of the host and device systems. It should be #included in any +// code which uses adl to dispatch logical + +#define __THRUST_HOST_SYSTEM_LOGICAL_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/logical.h> +#include __THRUST_HOST_SYSTEM_LOGICAL_HEADER +#undef __THRUST_HOST_SYSTEM_LOGICAL_HEADER + +#define __THRUST_DEVICE_SYSTEM_LOGICAL_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/logical.h> +#include __THRUST_DEVICE_SYSTEM_LOGICAL_HEADER +#undef __THRUST_DEVICE_SYSTEM_LOGICAL_HEADER + diff --git a/compat/thrust/system/detail/adl/malloc_and_free.h b/compat/thrust/system/detail/adl/malloc_and_free.h new file mode 100644 index 0000000..7d99a26 --- /dev/null +++ b/compat/thrust/system/detail/adl/malloc_and_free.h @@ -0,0 +1,32 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// the purpose of this header is to #include the malloc_and_free.h header +// of the host and device systems. 
It should be #included in any +// code which uses adl to dispatch malloc_and_free + +#define __THRUST_HOST_SYSTEM_MALLOC_AND_FREE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/malloc_and_free.h> +#include __THRUST_HOST_SYSTEM_MALLOC_AND_FREE_HEADER +#undef __THRUST_HOST_SYSTEM_MALLOC_AND_FREE_HEADER + +#define __THRUST_DEVICE_SYSTEM_MALLOC_AND_FREE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/malloc_and_free.h> +#include __THRUST_DEVICE_SYSTEM_MALLOC_AND_FREE_HEADER +#undef __THRUST_DEVICE_SYSTEM_MALLOC_AND_FREE_HEADER + diff --git a/compat/thrust/system/detail/adl/merge.h b/compat/thrust/system/detail/adl/merge.h new file mode 100644 index 0000000..59d8ace --- /dev/null +++ b/compat/thrust/system/detail/adl/merge.h @@ -0,0 +1,32 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a fill of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// the purpose of this header is to #include the merge.h header +// of the host and device systems. It should be #included in any +// code which uses adl to dispatch merge + +#define __THRUST_HOST_SYSTEM_MERGE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/merge.h> +#include __THRUST_HOST_SYSTEM_MERGE_HEADER +#undef __THRUST_HOST_SYSTEM_MERGE_HEADER + +#define __THRUST_DEVICE_SYSTEM_MERGE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/merge.h> +#include __THRUST_DEVICE_SYSTEM_MERGE_HEADER +#undef __THRUST_DEVICE_SYSTEM_MERGE_HEADER + diff --git a/compat/thrust/system/detail/adl/mismatch.h b/compat/thrust/system/detail/adl/mismatch.h new file mode 100644 index 0000000..d2d1831 --- /dev/null +++ b/compat/thrust/system/detail/adl/mismatch.h @@ -0,0 +1,32 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a fill of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// the purpose of this header is to #include the mismatch.h header +// of the host and device systems. 
It should be #included in any +// code which uses adl to dispatch mismatch + +#define __THRUST_HOST_SYSTEM_MISMATCH_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/mismatch.h> +#include __THRUST_HOST_SYSTEM_MISMATCH_HEADER +#undef __THRUST_HOST_SYSTEM_MISMATCH_HEADER + +#define __THRUST_DEVICE_SYSTEM_MISMATCH_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/mismatch.h> +#include __THRUST_DEVICE_SYSTEM_MISMATCH_HEADER +#undef __THRUST_DEVICE_SYSTEM_MISMATCH_HEADER + diff --git a/compat/thrust/system/detail/adl/partition.h b/compat/thrust/system/detail/adl/partition.h new file mode 100644 index 0000000..efdc605 --- /dev/null +++ b/compat/thrust/system/detail/adl/partition.h @@ -0,0 +1,32 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a fill of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// the purpose of this header is to #include the partition.h header +// of the host and device systems. It should be #included in any +// code which uses adl to dispatch partition + +#define __THRUST_HOST_SYSTEM_PARTITION_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/partition.h> +#include __THRUST_HOST_SYSTEM_PARTITION_HEADER +#undef __THRUST_HOST_SYSTEM_PARTITION_HEADER + +#define __THRUST_DEVICE_SYSTEM_PARTITION_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/partition.h> +#include __THRUST_DEVICE_SYSTEM_PARTITION_HEADER +#undef __THRUST_DEVICE_SYSTEM_PARTITION_HEADER + diff --git a/compat/thrust/system/detail/adl/reduce.h b/compat/thrust/system/detail/adl/reduce.h new file mode 100644 index 0000000..afa00f9 --- /dev/null +++ b/compat/thrust/system/detail/adl/reduce.h @@ -0,0 +1,32 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// the purpose of this header is to #include the reduce.h header +// of the host and device systems. 
It should be #included in any +// code which uses adl to dispatch reduce + +#define __THRUST_HOST_SYSTEM_REDUCE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/reduce.h> +#include __THRUST_HOST_SYSTEM_REDUCE_HEADER +#undef __THRUST_HOST_SYSTEM_REDUCE_HEADER + +#define __THRUST_DEVICE_SYSTEM_REDUCE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/reduce.h> +#include __THRUST_DEVICE_SYSTEM_REDUCE_HEADER +#undef __THRUST_DEVICE_SYSTEM_REDUCE_HEADER + diff --git a/compat/thrust/system/detail/adl/reduce_by_key.h b/compat/thrust/system/detail/adl/reduce_by_key.h new file mode 100644 index 0000000..eac65b7 --- /dev/null +++ b/compat/thrust/system/detail/adl/reduce_by_key.h @@ -0,0 +1,32 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// the purpose of this header is to #include the reduce_by_key.h header +// of the host and device systems. It should be #included in any +// code which uses adl to dispatch reduce_by_key + +#define __THRUST_HOST_SYSTEM_REDUCE_BY_KEY_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/reduce_by_key.h> +#include __THRUST_HOST_SYSTEM_REDUCE_BY_KEY_HEADER +#undef __THRUST_HOST_SYSTEM_REDUCE_BY_KEY_HEADER + +#define __THRUST_DEVICE_SYSTEM_REDUCE_BY_KEY_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/reduce_by_key.h> +#include __THRUST_DEVICE_SYSTEM_REDUCE_BY_KEY_HEADER +#undef __THRUST_DEVICE_SYSTEM_REDUCE_BY_KEY_HEADER + diff --git a/compat/thrust/system/detail/adl/remove.h b/compat/thrust/system/detail/adl/remove.h new file mode 100644 index 0000000..9d64be8 --- /dev/null +++ b/compat/thrust/system/detail/adl/remove.h @@ -0,0 +1,32 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a fill of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// the purpose of this header is to #include the remove.h header +// of the host and device systems. 
It should be #included in any +// code which uses adl to dispatch remove + +#define __THRUST_HOST_SYSTEM_REMOVE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/remove.h> +#include __THRUST_HOST_SYSTEM_REMOVE_HEADER +#undef __THRUST_HOST_SYSTEM_REMOVE_HEADER + +#define __THRUST_DEVICE_SYSTEM_REMOVE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/remove.h> +#include __THRUST_DEVICE_SYSTEM_REMOVE_HEADER +#undef __THRUST_DEVICE_SYSTEM_REMOVE_HEADER + diff --git a/compat/thrust/system/detail/adl/replace.h b/compat/thrust/system/detail/adl/replace.h new file mode 100644 index 0000000..e4d8bd2 --- /dev/null +++ b/compat/thrust/system/detail/adl/replace.h @@ -0,0 +1,32 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a fill of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// the purpose of this header is to #include the replace.h header +// of the host and device systems. It should be #included in any +// code which uses adl to dispatch replace + +#define __THRUST_HOST_SYSTEM_REPLACE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/replace.h> +#include __THRUST_HOST_SYSTEM_REPLACE_HEADER +#undef __THRUST_HOST_SYSTEM_REPLACE_HEADER + +#define __THRUST_DEVICE_SYSTEM_REPLACE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/replace.h> +#include __THRUST_DEVICE_SYSTEM_REPLACE_HEADER +#undef __THRUST_DEVICE_SYSTEM_REPLACE_HEADER + diff --git a/compat/thrust/system/detail/adl/reverse.h b/compat/thrust/system/detail/adl/reverse.h new file mode 100644 index 0000000..8cbcfd8 --- /dev/null +++ b/compat/thrust/system/detail/adl/reverse.h @@ -0,0 +1,32 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a fill of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// the purpose of this header is to #include the reverse.h header +// of the host and device systems. 
It should be #included in any +// code which uses adl to dispatch reverse + +#define __THRUST_HOST_SYSTEM_REVERSE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/reverse.h> +#include __THRUST_HOST_SYSTEM_REVERSE_HEADER +#undef __THRUST_HOST_SYSTEM_REVERSE_HEADER + +#define __THRUST_DEVICE_SYSTEM_REVERSE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/reverse.h> +#include __THRUST_DEVICE_SYSTEM_REVERSE_HEADER +#undef __THRUST_DEVICE_SYSTEM_REVERSE_HEADER + diff --git a/compat/thrust/system/detail/adl/scan.h b/compat/thrust/system/detail/adl/scan.h new file mode 100644 index 0000000..e70cd9f --- /dev/null +++ b/compat/thrust/system/detail/adl/scan.h @@ -0,0 +1,32 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// the purpose of this header is to #include the scan.h header +// of the host and device systems. It should be #included in any +// code which uses adl to dispatch scan + +#define __THRUST_HOST_SYSTEM_SCAN_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/scan.h> +#include __THRUST_HOST_SYSTEM_SCAN_HEADER +#undef __THRUST_HOST_SYSTEM_SCAN_HEADER + +#define __THRUST_DEVICE_SYSTEM_SCAN_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/scan.h> +#include __THRUST_DEVICE_SYSTEM_SCAN_HEADER +#undef __THRUST_DEVICE_SYSTEM_SCAN_HEADER + diff --git a/compat/thrust/system/detail/adl/scan_by_key.h b/compat/thrust/system/detail/adl/scan_by_key.h new file mode 100644 index 0000000..02c4b84 --- /dev/null +++ b/compat/thrust/system/detail/adl/scan_by_key.h @@ -0,0 +1,32 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// the purpose of this header is to #include the scan_by_key.h header +// of the host and device systems. 
It should be #included in any +// code which uses adl to dispatch scan_by_key + +#define __THRUST_HOST_SYSTEM_SCAN_BY_KEY_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/scan_by_key.h> +#include __THRUST_HOST_SYSTEM_SCAN_BY_KEY_HEADER +#undef __THRUST_HOST_SYSTEM_SCAN_BY_KEY_HEADER + +#define __THRUST_DEVICE_SYSTEM_SCAN_BY_KEY_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/scan_by_key.h> +#include __THRUST_DEVICE_SYSTEM_SCAN_BY_KEY_HEADER +#undef __THRUST_DEVICE_SYSTEM_SCAN_BY_KEY_HEADER + diff --git a/compat/thrust/system/detail/adl/scatter.h b/compat/thrust/system/detail/adl/scatter.h new file mode 100644 index 0000000..b94b0d9 --- /dev/null +++ b/compat/thrust/system/detail/adl/scatter.h @@ -0,0 +1,32 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a fill of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// the purpose of this header is to #include the scatter.h header +// of the host and device systems. It should be #included in any +// code which uses adl to dispatch scatter + +#define __THRUST_HOST_SYSTEM_SCATTER_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/scatter.h> +#include __THRUST_HOST_SYSTEM_SCATTER_HEADER +#undef __THRUST_HOST_SYSTEM_SCATTER_HEADER + +#define __THRUST_DEVICE_SYSTEM_SCATTER_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/scatter.h> +#include __THRUST_DEVICE_SYSTEM_SCATTER_HEADER +#undef __THRUST_DEVICE_SYSTEM_SCATTER_HEADER + diff --git a/compat/thrust/system/detail/adl/sequence.h b/compat/thrust/system/detail/adl/sequence.h new file mode 100644 index 0000000..07dcc7b --- /dev/null +++ b/compat/thrust/system/detail/adl/sequence.h @@ -0,0 +1,32 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a fill of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// the purpose of this header is to #include the sequence.h header +// of the host and device systems. 
It should be #included in any +// code which uses adl to dispatch sequence + +#define __THRUST_HOST_SYSTEM_SEQUENCE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/sequence.h> +#include __THRUST_HOST_SYSTEM_SEQUENCE_HEADER +#undef __THRUST_HOST_SYSTEM_SEQUENCE_HEADER + +#define __THRUST_DEVICE_SYSTEM_SEQUENCE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/sequence.h> +#include __THRUST_DEVICE_SYSTEM_SEQUENCE_HEADER +#undef __THRUST_DEVICE_SYSTEM_SEQUENCE_HEADER + diff --git a/compat/thrust/system/detail/adl/set_operations.h b/compat/thrust/system/detail/adl/set_operations.h new file mode 100644 index 0000000..9901b46 --- /dev/null +++ b/compat/thrust/system/detail/adl/set_operations.h @@ -0,0 +1,32 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a fill of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// the purpose of this header is to #include the set_operations.h header +// of the host and device systems. It should be #included in any +// code which uses adl to dispatch set_operations + +#define __THRUST_HOST_SYSTEM_SET_OPERATIONS_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/set_operations.h> +#include __THRUST_HOST_SYSTEM_SET_OPERATIONS_HEADER +#undef __THRUST_HOST_SYSTEM_SET_OPERATIONS_HEADER + +#define __THRUST_DEVICE_SYSTEM_SET_OPERATIONS_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/set_operations.h> +#include __THRUST_DEVICE_SYSTEM_SET_OPERATIONS_HEADER +#undef __THRUST_DEVICE_SYSTEM_SET_OPERATIONS_HEADER + diff --git a/compat/thrust/system/detail/adl/sort.h b/compat/thrust/system/detail/adl/sort.h new file mode 100644 index 0000000..afcb903 --- /dev/null +++ b/compat/thrust/system/detail/adl/sort.h @@ -0,0 +1,32 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// the purpose of this header is to #include the sort.h header +// of the host and device systems. 
It should be #included in any +// code which uses adl to dispatch sort + +#define __THRUST_HOST_SYSTEM_SORT_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/sort.h> +#include __THRUST_HOST_SYSTEM_SORT_HEADER +#undef __THRUST_HOST_SYSTEM_SORT_HEADER + +#define __THRUST_DEVICE_SYSTEM_SORT_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/sort.h> +#include __THRUST_DEVICE_SYSTEM_SORT_HEADER +#undef __THRUST_DEVICE_SYSTEM_SORT_HEADER + diff --git a/compat/thrust/system/detail/adl/swap_ranges.h b/compat/thrust/system/detail/adl/swap_ranges.h new file mode 100644 index 0000000..c006936 --- /dev/null +++ b/compat/thrust/system/detail/adl/swap_ranges.h @@ -0,0 +1,32 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a fill of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// the purpose of this header is to #include the swap_ranges.h header +// of the host and device systems. It should be #included in any +// code which uses adl to dispatch swap_ranges + +#define __THRUST_HOST_SYSTEM_SWAP_RANGES_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/swap_ranges.h> +#include __THRUST_HOST_SYSTEM_SWAP_RANGES_HEADER +#undef __THRUST_HOST_SYSTEM_SWAP_RANGES_HEADER + +#define __THRUST_DEVICE_SYSTEM_SWAP_RANGES_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/swap_ranges.h> +#include __THRUST_DEVICE_SYSTEM_SWAP_RANGES_HEADER +#undef __THRUST_DEVICE_SYSTEM_SWAP_RANGES_HEADER + diff --git a/compat/thrust/system/detail/adl/tabulate.h b/compat/thrust/system/detail/adl/tabulate.h new file mode 100644 index 0000000..cb1fdeb --- /dev/null +++ b/compat/thrust/system/detail/adl/tabulate.h @@ -0,0 +1,32 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a fill of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// the purpose of this header is to #include the tabulate.h header +// of the host and device systems. 
It should be #included in any +// code which uses adl to dispatch tabulate + +#define __THRUST_HOST_SYSTEM_TABULATE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/tabulate.h> +#include __THRUST_HOST_SYSTEM_TABULATE_HEADER +#undef __THRUST_HOST_SYSTEM_TABULATE_HEADER + +#define __THRUST_DEVICE_SYSTEM_TABULATE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/tabulate.h> +#include __THRUST_DEVICE_SYSTEM_TABULATE_HEADER +#undef __THRUST_DEVICE_SYSTEM_TABULATE_HEADER + diff --git a/compat/thrust/system/detail/adl/temporary_buffer.h b/compat/thrust/system/detail/adl/temporary_buffer.h new file mode 100644 index 0000000..66df0ea --- /dev/null +++ b/compat/thrust/system/detail/adl/temporary_buffer.h @@ -0,0 +1,32 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// the purpose of this header is to #include the temporary_buffer.h header +// of the host and device systems. It should be #included in any +// code which uses adl to dispatch get_temporary_buffer or return_temporary_buffer + +#define __THRUST_HOST_SYSTEM_TEMPORARY_BUFFER_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/temporary_buffer.h> +#include __THRUST_HOST_SYSTEM_TEMPORARY_BUFFER_HEADER +#undef __THRUST_HOST_SYSTEM_TEMPORARY_BUFFER_HEADER + +#define __THRUST_DEVICE_SYSTEM_TEMPORARY_BUFFER_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/temporary_buffer.h> +#include __THRUST_DEVICE_SYSTEM_TEMPORARY_BUFFER_HEADER +#undef __THRUST_DEVICE_SYSTEM_TEMPORARY_BUFFER_HEADER + diff --git a/compat/thrust/system/detail/adl/transform.h b/compat/thrust/system/detail/adl/transform.h new file mode 100644 index 0000000..c9e6a01 --- /dev/null +++ b/compat/thrust/system/detail/adl/transform.h @@ -0,0 +1,32 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a fill of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// the purpose of this header is to #include the transform.h header +// of the host and device systems. 
It should be #included in any +// code which uses adl to dispatch transform + +#define __THRUST_HOST_SYSTEM_TRANSFORM_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/transform.h> +#include __THRUST_HOST_SYSTEM_TRANSFORM_HEADER +#undef __THRUST_HOST_SYSTEM_TRANSFORM_HEADER + +#define __THRUST_DEVICE_SYSTEM_TRANSFORM_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/transform.h> +#include __THRUST_DEVICE_SYSTEM_TRANSFORM_HEADER +#undef __THRUST_DEVICE_SYSTEM_TRANSFORM_HEADER + diff --git a/compat/thrust/system/detail/adl/transform_reduce.h b/compat/thrust/system/detail/adl/transform_reduce.h new file mode 100644 index 0000000..0a5d977 --- /dev/null +++ b/compat/thrust/system/detail/adl/transform_reduce.h @@ -0,0 +1,32 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a fill of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// the purpose of this header is to #include the transform_reduce.h header +// of the host and device systems. It should be #included in any +// code which uses adl to dispatch transform_reduce + +#define __THRUST_HOST_SYSTEM_TRANSFORM_REDUCE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/transform_reduce.h> +#include __THRUST_HOST_SYSTEM_TRANSFORM_REDUCE_HEADER +#undef __THRUST_HOST_SYSTEM_TRANSFORM_REDUCE_HEADER + +#define __THRUST_DEVICE_SYSTEM_TRANSFORM_REDUCE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/transform_reduce.h> +#include __THRUST_DEVICE_SYSTEM_TRANSFORM_REDUCE_HEADER +#undef __THRUST_DEVICE_SYSTEM_TRANSFORM_REDUCE_HEADER + diff --git a/compat/thrust/system/detail/adl/transform_scan.h b/compat/thrust/system/detail/adl/transform_scan.h new file mode 100644 index 0000000..47c1dc3 --- /dev/null +++ b/compat/thrust/system/detail/adl/transform_scan.h @@ -0,0 +1,32 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a fill of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// the purpose of this header is to #include the transform_scan.h header +// of the host and device systems. 
It should be #included in any +// code which uses adl to dispatch transform_scan + +#define __THRUST_HOST_SYSTEM_TRANSFORM_SCAN_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/transform_scan.h> +#include __THRUST_HOST_SYSTEM_TRANSFORM_SCAN_HEADER +#undef __THRUST_HOST_SYSTEM_TRANSFORM_SCAN_HEADER + +#define __THRUST_DEVICE_SYSTEM_TRANSFORM_SCAN_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/transform_scan.h> +#include __THRUST_DEVICE_SYSTEM_TRANSFORM_SCAN_HEADER +#undef __THRUST_DEVICE_SYSTEM_TRANSFORM_SCAN_HEADER + diff --git a/compat/thrust/system/detail/adl/uninitialized_copy.h b/compat/thrust/system/detail/adl/uninitialized_copy.h new file mode 100644 index 0000000..7cb0b8e --- /dev/null +++ b/compat/thrust/system/detail/adl/uninitialized_copy.h @@ -0,0 +1,32 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a fill of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// the purpose of this header is to #include the uninitialized_copy.h header +// of the host and device systems. It should be #included in any +// code which uses adl to dispatch uninitialized_copy + +#define __THRUST_HOST_SYSTEM_UNINITIALIZED_COPY_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/uninitialized_copy.h> +#include __THRUST_HOST_SYSTEM_UNINITIALIZED_COPY_HEADER +#undef __THRUST_HOST_SYSTEM_UNINITIALIZED_COPY_HEADER + +#define __THRUST_DEVICE_SYSTEM_UNINITIALIZED_COPY_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/uninitialized_copy.h> +#include __THRUST_DEVICE_SYSTEM_UNINITIALIZED_COPY_HEADER +#undef __THRUST_DEVICE_SYSTEM_UNINITIALIZED_COPY_HEADER + diff --git a/compat/thrust/system/detail/adl/uninitialized_fill.h b/compat/thrust/system/detail/adl/uninitialized_fill.h new file mode 100644 index 0000000..9f00b51 --- /dev/null +++ b/compat/thrust/system/detail/adl/uninitialized_fill.h @@ -0,0 +1,32 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a fill of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// the purpose of this header is to #include the uninitialized_fill.h header +// of the host and device systems. 
It should be #included in any +// code which uses adl to dispatch uninitialized_fill + +#define __THRUST_HOST_SYSTEM_UNINITIALIZED_FILL_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/uninitialized_fill.h> +#include __THRUST_HOST_SYSTEM_UNINITIALIZED_FILL_HEADER +#undef __THRUST_HOST_SYSTEM_UNINITIALIZED_FILL_HEADER + +#define __THRUST_DEVICE_SYSTEM_UNINITIALIZED_FILL_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/uninitialized_fill.h> +#include __THRUST_DEVICE_SYSTEM_UNINITIALIZED_FILL_HEADER +#undef __THRUST_DEVICE_SYSTEM_UNINITIALIZED_FILL_HEADER + diff --git a/compat/thrust/system/detail/adl/unique.h b/compat/thrust/system/detail/adl/unique.h new file mode 100644 index 0000000..932ff58 --- /dev/null +++ b/compat/thrust/system/detail/adl/unique.h @@ -0,0 +1,32 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a fill of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// the purpose of this header is to #include the unique.h header +// of the host and device systems. It should be #included in any +// code which uses adl to dispatch unique + +#define __THRUST_HOST_SYSTEM_UNIQUE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/unique.h> +#include __THRUST_HOST_SYSTEM_UNIQUE_HEADER +#undef __THRUST_HOST_SYSTEM_UNIQUE_HEADER + +#define __THRUST_DEVICE_SYSTEM_UNIQUE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/unique.h> +#include __THRUST_DEVICE_SYSTEM_UNIQUE_HEADER +#undef __THRUST_DEVICE_SYSTEM_UNIQUE_HEADER + diff --git a/compat/thrust/system/detail/adl/unique_by_key.h b/compat/thrust/system/detail/adl/unique_by_key.h new file mode 100644 index 0000000..30e6f2f --- /dev/null +++ b/compat/thrust/system/detail/adl/unique_by_key.h @@ -0,0 +1,32 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a fill of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// the purpose of this header is to #include the unique_by_key.h header +// of the host and device systems. 
It should be #included in any +// code which uses adl to dispatch unique_by_key + +#define __THRUST_HOST_SYSTEM_UNIQUE_BY_KEY_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/unique_by_key.h> +#include __THRUST_HOST_SYSTEM_UNIQUE_BY_KEY_HEADER +#undef __THRUST_HOST_SYSTEM_UNIQUE_BY_KEY_HEADER + +#define __THRUST_DEVICE_SYSTEM_UNIQUE_BY_KEY_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/unique_by_key.h> +#include __THRUST_DEVICE_SYSTEM_UNIQUE_BY_KEY_HEADER +#undef __THRUST_DEVICE_SYSTEM_UNIQUE_BY_KEY_HEADER + diff --git a/compat/thrust/system/detail/bad_alloc.h b/compat/thrust/system/detail/bad_alloc.h new file mode 100644 index 0000000..bb73d1f --- /dev/null +++ b/compat/thrust/system/detail/bad_alloc.h @@ -0,0 +1,57 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ + +// define our own bad_alloc so we can set its .what() +class bad_alloc + : public std::bad_alloc +{ + public: + inline bad_alloc(const std::string &w) + : std::bad_alloc(), m_what() + { + m_what = std::bad_alloc::what(); + m_what += ": "; + m_what += w; + } // end bad_alloc() + + inline virtual ~bad_alloc(void) throw () {}; + + inline virtual const char *what(void) const throw() + { + return m_what.c_str(); + } // end what() + + private: + std::string m_what; +}; // end bad_alloc + +} // end detail +} // end system +} // end thrust + diff --git a/compat/thrust/system/detail/errno.h b/compat/thrust/system/detail/errno.h new file mode 100644 index 0000000..34bc8cc --- /dev/null +++ b/compat/thrust/system/detail/errno.h @@ -0,0 +1,120 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#pragma once + +#include + +// The rationale for the existence of these apparently redundant definitions is +// to provide them portably and to avoid bringing in system headers which might +// pollute the global namespace. These identifiers are in lowercase to avoid +// colliding with the real macros in errno.h. 
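+//
+// An illustrative sketch of how these values are meant to be consumed (the
+// specific errc mapping shown here is an assumption based on the switch in
+// error_category.inl further below):
+//
+//   thrust::system::error_code ec(thrust::system::detail::enomem,
+//                                 thrust::system::system_category());
+//   // ec.default_error_condition() is intended to compare equal to
+//   // thrust::system::errc::not_enough_memory
+//
+// The 99xx numbers themselves are arbitrary; they only need to be distinct so
+// the system category can translate them into portable error_conditions.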
+ +namespace thrust +{ + +namespace system +{ + +namespace detail +{ + +static const int eafnosupport = 9901; +static const int eaddrinuse = 9902; +static const int eaddrnotavail = 9903; +static const int eisconn = 9904; +static const int ebadmsg = 9905; +static const int econnaborted = 9906; +static const int ealready = 9907; +static const int econnrefused = 9908; +static const int econnreset = 9909; +static const int edestaddrreq = 9910; +static const int ehostunreach = 9911; +static const int eidrm = 9912; +static const int emsgsize = 9913; +static const int enetdown = 9914; +static const int enetreset = 9915; +static const int enetunreach = 9916; +static const int enobufs = 9917; +static const int enolink = 9918; +static const int enodata = 9919; +static const int enomsg = 9920; +static const int enoprotoopt = 9921; +static const int enosr = 9922; +static const int enotsock = 9923; +static const int enostr = 9924; +static const int enotconn = 9925; +static const int enotsup = 9926; +static const int ecanceled = 9927; +static const int einprogress = 9928; +static const int eopnotsupp = 9929; +static const int ewouldblock = 9930; +static const int eownerdead = 9931; +static const int eproto = 9932; +static const int eprotonosupport = 9933; +static const int enotrecoverable = 9934; +static const int etime = 9935; +static const int etxtbsy = 9936; +static const int etimedout = 9938; +static const int eloop = 9939; +static const int eoverflow = 9940; +static const int eprototype = 9941; +static const int enosys = 9942; +static const int einval = 9943; +static const int erange = 9944; +static const int eilseq = 9945; +static const int e2big = 9946; +static const int edom = 9947; +static const int efault = 9948; +static const int ebadf = 9949; +static const int epipe = 9950; +static const int exdev = 9951; +static const int ebusy = 9952; +static const int enotempty = 9953; +static const int enoexec = 9954; +static const int eexist = 9955; +static const int efbig = 9956; +static const int enametoolong = 9957; +static const int enotty = 9958; +static const int eintr = 9959; +static const int espipe = 9960; +static const int eio = 9961; +static const int eisdir = 9962; +static const int echild = 9963; +static const int enolck = 9964; +static const int enospc = 9965; +static const int enxio = 9966; +static const int enodev = 9967; +static const int enoent = 9968; +static const int esrch = 9969; +static const int enotdir = 9970; +static const int enomem = 9971; +static const int eperm = 9972; +static const int eacces = 9973; +static const int erofs = 9974; +static const int edeadlk = 9975; +static const int eagain = 9976; +static const int enfile = 9977; +static const int emfile = 9978; +static const int emlink = 9979; + +} // end detail + +} // end system + +} // end thrust + diff --git a/compat/thrust/system/detail/error_category.inl b/compat/thrust/system/detail/error_category.inl new file mode 100644 index 0000000..8e19c89 --- /dev/null +++ b/compat/thrust/system/detail/error_category.inl @@ -0,0 +1,234 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ + +namespace system +{ + +error_category + ::~error_category(void) +{ + ; +} // end error_category::~error_category() + + +error_condition error_category + ::default_error_condition(int ev) const +{ + return error_condition(ev, *this); +} // end error_category::default_error_condition() + + +bool error_category + ::equivalent(int code, const error_condition &condition) const +{ + return default_error_condition(code) == condition; +} // end error_condition::equivalent() + + +bool error_category + ::equivalent(const error_code &code, int condition) const +{ + bool result = (this->operator==(code.category())) && (code.value() == condition); + return result; +} // end error_code::equivalent() + + +bool error_category + ::operator==(const error_category &rhs) const +{ + return this == &rhs; +} // end error_category::operator==() + + +bool error_category + ::operator!=(const error_category &rhs) const +{ + return !this->operator==(rhs); +} // end error_category::operator!=() + + +bool error_category + ::operator<(const error_category &rhs) const +{ + return thrust::less()(this,&rhs); +} // end error_category::operator<() + + +namespace detail +{ + + +class generic_error_category + : public error_category +{ + public: + inline generic_error_category(void) {} + + inline virtual const char *name(void) const + { + return "generic"; + } + + inline virtual std::string message(int ev) const + { + static const std::string unknown_err("Unknown error"); + + // XXX strerror is not thread-safe: + // prefer strerror_r (which is not provided on windows) + const char *c_str = std::strerror(ev); + return c_str ? 
std::string(c_str) : unknown_err; + } +}; // end generic_category_result + + +class system_error_category + : public error_category +{ + public: + inline system_error_category(void) {} + + inline virtual const char *name(void) const + { + return "system"; + } + + inline virtual std::string message(int ev) const + { + return generic_category().message(ev); + } + + inline virtual error_condition default_error_condition(int ev) const + { + using namespace errc; + + switch(ev) + { + case eafnosupport: return make_error_condition(address_family_not_supported); + case eaddrinuse: return make_error_condition(address_in_use); + case eaddrnotavail: return make_error_condition(address_not_available); + case eisconn: return make_error_condition(already_connected); + case e2big: return make_error_condition(argument_list_too_long); + case edom: return make_error_condition(argument_out_of_domain); + case efault: return make_error_condition(bad_address); + case ebadf: return make_error_condition(bad_file_descriptor); + case ebadmsg: return make_error_condition(bad_message); + case epipe: return make_error_condition(broken_pipe); + case econnaborted: return make_error_condition(connection_aborted); + case ealready: return make_error_condition(connection_already_in_progress); + case econnrefused: return make_error_condition(connection_refused); + case econnreset: return make_error_condition(connection_reset); + case exdev: return make_error_condition(cross_device_link); + case edestaddrreq: return make_error_condition(destination_address_required); + case ebusy: return make_error_condition(device_or_resource_busy); + case enotempty: return make_error_condition(directory_not_empty); + case enoexec: return make_error_condition(executable_format_error); + case eexist: return make_error_condition(file_exists); + case efbig: return make_error_condition(file_too_large); + case enametoolong: return make_error_condition(filename_too_long); + case enosys: return make_error_condition(function_not_supported); + case ehostunreach: return make_error_condition(host_unreachable); + case eidrm: return make_error_condition(identifier_removed); + case eilseq: return make_error_condition(illegal_byte_sequence); + case enotty: return make_error_condition(inappropriate_io_control_operation); + case eintr: return make_error_condition(interrupted); + case einval: return make_error_condition(invalid_argument); + case espipe: return make_error_condition(invalid_seek); + case eio: return make_error_condition(io_error); + case eisdir: return make_error_condition(is_a_directory); + case emsgsize: return make_error_condition(message_size); + case enetdown: return make_error_condition(network_down); + case enetreset: return make_error_condition(network_reset); + case enetunreach: return make_error_condition(network_unreachable); + case enobufs: return make_error_condition(no_buffer_space); + case echild: return make_error_condition(no_child_process); + case enolink: return make_error_condition(no_link); + case enolck: return make_error_condition(no_lock_available); + case enodata: return make_error_condition(no_message_available); + case enomsg: return make_error_condition(no_message); + case enoprotoopt: return make_error_condition(no_protocol_option); + case enospc: return make_error_condition(no_space_on_device); + case enosr: return make_error_condition(no_stream_resources); + case enxio: return make_error_condition(no_such_device_or_address); + case enodev: return make_error_condition(no_such_device); + case enoent: return 
make_error_condition(no_such_file_or_directory); + case esrch: return make_error_condition(no_such_process); + case enotdir: return make_error_condition(not_a_directory); + case enotsock: return make_error_condition(not_a_socket); + case enostr: return make_error_condition(not_a_stream); + case enotconn: return make_error_condition(not_connected); + case enomem: return make_error_condition(not_enough_memory); + case enotsup: return make_error_condition(not_supported); + case ecanceled: return make_error_condition(operation_canceled); + case einprogress: return make_error_condition(operation_in_progress); + case eperm: return make_error_condition(operation_not_permitted); + case eopnotsupp: return make_error_condition(operation_not_supported); + case ewouldblock: return make_error_condition(operation_would_block); + case eownerdead: return make_error_condition(owner_dead); + case eacces: return make_error_condition(permission_denied); + case eproto: return make_error_condition(protocol_error); + case eprotonosupport: return make_error_condition(protocol_not_supported); + case erofs: return make_error_condition(read_only_file_system); + case edeadlk: return make_error_condition(resource_deadlock_would_occur); + case eagain: return make_error_condition(resource_unavailable_try_again); + case erange: return make_error_condition(result_out_of_range); + case enotrecoverable: return make_error_condition(state_not_recoverable); + case etime: return make_error_condition(stream_timeout); + case etxtbsy: return make_error_condition(text_file_busy); + case etimedout: return make_error_condition(timed_out); + case enfile: return make_error_condition(too_many_files_open_in_system); + case emfile: return make_error_condition(too_many_files_open); + case emlink: return make_error_condition(too_many_links); + case eloop: return make_error_condition(too_many_symbolic_link_levels); + case eoverflow: return make_error_condition(value_too_large); + case eprototype: return make_error_condition(wrong_protocol_type); + default: return error_condition(ev,system_category()); + } + } +}; // end system_category_result + + +} // end detail + + +const error_category &generic_category(void) +{ + static const detail::generic_error_category result; + return result; +} + + +const error_category &system_category(void) +{ + static const detail::system_error_category result; + return result; +} + + +} // end system + +} // end thrust + diff --git a/compat/thrust/system/detail/error_code.inl b/compat/thrust/system/detail/error_code.inl new file mode 100644 index 0000000..0cf86b4 --- /dev/null +++ b/compat/thrust/system/detail/error_code.inl @@ -0,0 +1,197 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
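A hedged, host-only sketch of how the two categories defined above relate — the <thrust/system/error_code.h> header path is assumed from the public Thrust layout, and the example itself is not part of the commit:

    #include <thrust/system/error_code.h>   // assumed public header path
    #include <cassert>

    int main()
    {
        using namespace thrust::system;

        // a raw value carried by the "system" category...
        error_code ec(detail::einval, system_category());

        // ...is mapped onto the portable errc condition by the switch in
        // system_error_category::default_error_condition() above
        assert(ec == make_error_condition(errc::invalid_argument));
        return 0;
    }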
+ */ + + +#pragma once + +#include + +namespace thrust +{ + +namespace system +{ + +error_code + ::error_code(void) + :m_val(0),m_cat(&system_category()) +{ + ; +} // end error_code::error_code() + + +error_code + ::error_code(int val, const error_category &cat) + :m_val(val),m_cat(&cat) +{ + ; +} // end error_code::error_code() + + +template + error_code + ::error_code(ErrorCodeEnum e +// XXX WAR msvc's problem with enable_if +#if THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC + , typename thrust::detail::enable_if::value>::type * +#endif // THRUST_HOST_COMPILER_MSVC + ) +{ + *this = make_error_code(e); +} // end error_code::error_code() + + +void error_code + ::assign(int val, const error_category &cat) +{ + m_val = val; + m_cat = &cat; +} // end error_code::assign() + + +template +// XXX WAR msvc's problem with enable_if +#if THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC + typename thrust::detail::enable_if::value, error_code>::type & +#else + error_code & +#endif // THRUST_HOST_COMPILER_MSVC + error_code + ::operator=(ErrorCodeEnum e) +{ + *this = make_error_code(e); + return *this; +} // end error_code::operator=() + + +void error_code + ::clear(void) +{ + m_val = 0; + m_cat = &system_category(); +} // end error_code::clear() + + +int error_code + ::value(void) const +{ + return m_val; +} // end error_code::value() + + +const error_category &error_code + ::category(void) const +{ + return *m_cat; +} // end error_code::category() + + +error_condition error_code + ::default_error_condition(void) const +{ + return category().default_error_condition(value()); +} // end error_code::default_error_condition() + + +std::string error_code + ::message(void) const +{ + return category().message(value()); +} // end error_code::message() + + +error_code + ::operator bool (void) const +{ + return value() != 0; +} // end error_code::operator bool () + + +error_code make_error_code(errc::errc_t e) +{ + return error_code(static_cast(e), generic_category()); +} // end make_error_code() + + +bool operator<(const error_code &lhs, const error_code &rhs) +{ + bool result = lhs.category().operator<(rhs.category()); + result = result || lhs.category().operator==(rhs.category()); + result = result || lhs.value() < rhs.value(); + return result; +} // end operator==() + + +template + std::basic_ostream& + operator<<(std::basic_ostream &os, const error_code &ec) +{ + return os << ec.category().name() << ':' << ec.value(); +} // end operator<<() + + +bool operator==(const error_code &lhs, const error_code &rhs) +{ + return lhs.category().operator==(rhs.category()) && lhs.value() == rhs.value(); +} // end operator==() + + +bool operator==(const error_code &lhs, const error_condition &rhs) +{ + return lhs.category().equivalent(lhs.value(), rhs) || rhs.category().equivalent(lhs,rhs.value()); +} // end operator==() + + +bool operator==(const error_condition &lhs, const error_code &rhs) +{ + return rhs.category().equivalent(lhs.value(), lhs) || lhs.category().equivalent(rhs, lhs.value()); +} // end operator==() + + +bool operator==(const error_condition &lhs, const error_condition &rhs) +{ + return lhs.category().operator==(rhs.category()) && lhs.value() == rhs.value(); +} // end operator==() + + +bool operator!=(const error_code &lhs, const error_code &rhs) +{ + return !(lhs == rhs); +} // end operator!=() + + +bool operator!=(const error_code &lhs, const error_condition &rhs) +{ + return !(lhs == rhs); +} // end operator!=() + + +bool operator!=(const error_condition &lhs, const error_code &rhs) +{ + return 
!(lhs == rhs); +} // end operator!=() + + +bool operator!=(const error_condition &lhs, const error_condition &rhs) +{ + return !(lhs == rhs); +} // end operator!=() + + +} // end system + +} // end thrust + diff --git a/compat/thrust/system/detail/error_condition.inl b/compat/thrust/system/detail/error_condition.inl new file mode 100644 index 0000000..00fbaf0 --- /dev/null +++ b/compat/thrust/system/detail/error_condition.inl @@ -0,0 +1,133 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#pragma once + +#include +#include + +namespace thrust +{ + +namespace system +{ + +error_condition + ::error_condition(void) + :m_val(0),m_cat(&generic_category()) +{ + ; +} // end error_condition::error_condition() + + +error_condition + ::error_condition(int val, const error_category &cat) + :m_val(val),m_cat(&cat) +{ + ; +} // end error_condition::error_condition() + + +template + error_condition + ::error_condition(ErrorConditionEnum e +// XXX WAR msvc's problem with enable_if +#if THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC + , typename thrust::detail::enable_if::value>::type * +#endif // THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC + ) +{ + *this = make_error_condition(e); +} // end error_condition::error_condition() + + +void error_condition + ::assign(int val, const error_category &cat) +{ + m_val = val; + m_cat = &cat; +} // end error_category::assign() + + +template +// XXX WAR msvc's problem with enable_if +#if THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC + typename thrust::detail::enable_if::value, error_condition>::type & +#else + error_condition & +#endif // THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC + error_condition + ::operator=(ErrorConditionEnum e) +{ + *this = make_error_condition(e); + return *this; +} // end error_condition::operator=() + + +void error_condition + ::clear(void) +{ + m_val = 0; + m_cat = &generic_category(); +} // end error_condition::clear() + + +int error_condition + ::value(void) const +{ + return m_val; +} // end error_condition::value() + + +const error_category &error_condition + ::category(void) const +{ + return *m_cat; +} // end error_condition::category() + + +std::string error_condition + ::message(void) const +{ + return category().message(value()); +} // end error_condition::message() + + +error_condition + ::operator bool (void) const +{ + return value() != 0; +} // end error_condition::operator bool () + + +error_condition make_error_condition(errc::errc_t e) +{ + return error_condition(static_cast(e), generic_category()); +} // end make_error_condition() + + +bool operator<(const error_condition &lhs, + const error_condition &rhs) +{ + return lhs.category().operator<(rhs.category()) || (lhs.category().operator==(rhs.category()) && (lhs.value() < rhs.value())); +} // end operator<() + + +} // end system + +} // end thrust + diff --git a/compat/thrust/system/detail/generic/adjacent_difference.h b/compat/thrust/system/detail/generic/adjacent_difference.h new file 
mode 100644 index 0000000..bb340df --- /dev/null +++ b/compat/thrust/system/detail/generic/adjacent_difference.h @@ -0,0 +1,53 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file adjacent_difference.h + * \brief Generic implementation of adjacent_difference. + */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + +template +OutputIterator adjacent_difference(thrust::execution_policy &exec, + InputIterator first, InputIterator last, + OutputIterator result); + +template +OutputIterator adjacent_difference(thrust::execution_policy &exec, + InputIterator first, InputIterator last, + OutputIterator result, + BinaryFunction binary_op); + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/detail/generic/adjacent_difference.inl b/compat/thrust/system/detail/generic/adjacent_difference.inl new file mode 100644 index 0000000..619b29f --- /dev/null +++ b/compat/thrust/system/detail/generic/adjacent_difference.inl @@ -0,0 +1,76 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + +template +OutputIterator adjacent_difference(thrust::execution_policy &exec, + InputIterator first, InputIterator last, + OutputIterator result) +{ + typedef typename thrust::iterator_traits::value_type InputType; + thrust::minus binary_op; + + return thrust::adjacent_difference(exec, first, last, result, binary_op); +} // end adjacent_difference() + +template +OutputIterator adjacent_difference(thrust::execution_policy &exec, + InputIterator first, InputIterator last, + OutputIterator result, + BinaryFunction binary_op) +{ + typedef typename thrust::iterator_traits::value_type InputType; + + if(first == last) + { + // empty range, nothing to do + return result; + } + else + { + // an in-place operation is requested, copy the input and call the entry point + // XXX a special-purpose kernel would be faster here since + // only block boundaries need to be copied + thrust::detail::temporary_array input_copy(exec, first, last); + + *result = *first; + thrust::transform(exec, input_copy.begin() + 1, input_copy.end(), input_copy.begin(), result + 1, binary_op); + } + + return result + (last - first); +} + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/generic/advance.h b/compat/thrust/system/detail/generic/advance.h new file mode 100644 index 0000000..249aac7 --- /dev/null +++ b/compat/thrust/system/detail/generic/advance.h @@ -0,0 +1,40 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#pragma once + +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + +template +void advance(InputIterator& i, Distance n); + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/detail/generic/advance.inl b/compat/thrust/system/detail/generic/advance.inl new file mode 100644 index 0000000..b95737a --- /dev/null +++ b/compat/thrust/system/detail/generic/advance.inl @@ -0,0 +1,62 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
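Because the generic implementation above first copies the input into a temporary_array, the result range may alias the input. A minimal usage sketch (not part of the commit):

    #include <thrust/adjacent_difference.h>
    #include <thrust/device_vector.h>

    int main()
    {
        thrust::device_vector<int> v(4);
        v[0] = 1; v[1] = 2; v[2] = 4; v[3] = 8;

        // in-place is safe: the input is copied to a temporary before transforming
        thrust::adjacent_difference(v.begin(), v.end(), v.begin());
        // v is now {1, 1, 2, 4}
        return 0;
    }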
+ */ + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ +namespace detail +{ + +template +void advance(InputIterator& i, Distance n, thrust::incrementable_traversal_tag) +{ + while(n) + { + ++i; + --n; + } // end while +} // end advance() + +template +void advance(InputIterator& i, Distance n, thrust::random_access_traversal_tag) +{ + i += n; +} // end advance() + +} // end detail + +template +void advance(InputIterator& i, Distance n) +{ + // dispatch on iterator traversal + thrust::system::detail::generic::detail::advance(i, n, + typename thrust::iterator_traversal::type()); +} // end advance() + +} // end namespace detail +} // end namespace generic +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/generic/binary_search.h b/compat/thrust/system/detail/generic/binary_search.h new file mode 100644 index 0000000..7fd6c50 --- /dev/null +++ b/compat/thrust/system/detail/generic/binary_search.h @@ -0,0 +1,156 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file binary_search.h + * \brief Generic implementations of binary search functions. + */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + + +template +ForwardIterator lower_bound(thrust::execution_policy &exec, + ForwardIterator begin, + ForwardIterator end, + const T& value); + +template +ForwardIterator lower_bound(thrust::execution_policy &exec, + ForwardIterator begin, + ForwardIterator end, + const T& value, + StrictWeakOrdering comp); + + +template +ForwardIterator upper_bound(thrust::execution_policy &exec, + ForwardIterator begin, + ForwardIterator end, + const T& value); + +template +ForwardIterator upper_bound(thrust::execution_policy &exec, + ForwardIterator begin, + ForwardIterator end, + const T& value, + StrictWeakOrdering comp); + + +template +bool binary_search(thrust::execution_policy &exec, + ForwardIterator begin, + ForwardIterator end, + const T& value); + +template +bool binary_search(thrust::execution_policy &exec, + ForwardIterator begin, + ForwardIterator end, + const T& value, + StrictWeakOrdering comp); + + +template +OutputIterator lower_bound(thrust::execution_policy &exec, + ForwardIterator begin, + ForwardIterator end, + InputIterator values_begin, + InputIterator values_end, + OutputIterator output); + +template +OutputIterator lower_bound(thrust::execution_policy &exec, + ForwardIterator begin, + ForwardIterator end, + InputIterator values_begin, + InputIterator values_end, + OutputIterator output, + StrictWeakOrdering comp); + + +template +OutputIterator upper_bound(thrust::execution_policy &exec, + ForwardIterator begin, + ForwardIterator end, + InputIterator values_begin, + InputIterator values_end, + OutputIterator output); + +template +OutputIterator upper_bound(thrust::execution_policy &exec, + ForwardIterator begin, + ForwardIterator end, + 
InputIterator values_begin, + InputIterator values_end, + OutputIterator output, + StrictWeakOrdering comp); + + +template +OutputIterator binary_search(thrust::execution_policy &exec, + ForwardIterator begin, + ForwardIterator end, + InputIterator values_begin, + InputIterator values_end, + OutputIterator output); + +template +OutputIterator binary_search(thrust::execution_policy &exec, + ForwardIterator begin, + ForwardIterator end, + InputIterator values_begin, + InputIterator values_end, + OutputIterator output, + StrictWeakOrdering comp); + + +template +thrust::pair +equal_range(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + const LessThanComparable &value); + +template +thrust::pair +equal_range(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + const LessThanComparable &value, + StrictWeakOrdering comp); + + + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/detail/generic/binary_search.inl b/compat/thrust/system/detail/generic/binary_search.inl new file mode 100644 index 0000000..151ac0e --- /dev/null +++ b/compat/thrust/system/detail/generic/binary_search.inl @@ -0,0 +1,342 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! 
\file binary_search.inl + * \brief Inline file for binary_search.h + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +namespace thrust +{ +namespace detail +{ + +// XXX WAR circular #inclusion with this forward declaration +template class temporary_array; + +} // end detail +namespace system +{ +namespace detail +{ +namespace generic +{ +namespace detail +{ + + +// short names to avoid nvcc bug +struct lbf +{ + template + __host__ __device__ + typename thrust::iterator_traits::difference_type + operator()(RandomAccessIterator begin, RandomAccessIterator end, const T& value, StrictWeakOrdering comp) + { + return thrust::system::detail::generic::scalar::lower_bound(begin, end, value, comp) - begin; + } +}; + +struct ubf +{ + template + __host__ __device__ + typename thrust::iterator_traits::difference_type + operator()(RandomAccessIterator begin, RandomAccessIterator end, const T& value, StrictWeakOrdering comp){ + return thrust::system::detail::generic::scalar::upper_bound(begin, end, value, comp) - begin; + } +}; + +struct bsf +{ + template + __host__ __device__ + bool operator()(RandomAccessIterator begin, RandomAccessIterator end, const T& value, StrictWeakOrdering comp){ + RandomAccessIterator iter = thrust::system::detail::generic::scalar::lower_bound(begin, end, value, comp); + + thrust::detail::host_device_function wrapped_comp(comp); + + return iter != end && !wrapped_comp(value, *iter); + } +}; + + +template +struct binary_search_functor +{ + ForwardIterator begin; + ForwardIterator end; + StrictWeakOrdering comp; + BinarySearchFunction func; + + binary_search_functor(ForwardIterator begin, ForwardIterator end, StrictWeakOrdering comp, BinarySearchFunction func) + : begin(begin), end(end), comp(comp), func(func) {} + + template + __host__ __device__ + void operator()(Tuple t) + { + thrust::get<1>(t) = func(begin, end, thrust::get<0>(t), comp); + } +}; // binary_search_functor + + +// Vector Implementation +template +OutputIterator binary_search(thrust::execution_policy &exec, + ForwardIterator begin, + ForwardIterator end, + InputIterator values_begin, + InputIterator values_end, + OutputIterator output, + StrictWeakOrdering comp, + BinarySearchFunction func) +{ + thrust::for_each(exec, + thrust::make_zip_iterator(thrust::make_tuple(values_begin, output)), + thrust::make_zip_iterator(thrust::make_tuple(values_end, output + thrust::distance(values_begin, values_end))), + detail::binary_search_functor(begin, end, comp, func)); + + return output + thrust::distance(values_begin, values_end); +} + + + +// Scalar Implementation +template +OutputType binary_search(thrust::execution_policy &exec, + ForwardIterator begin, + ForwardIterator end, + const T& value, + StrictWeakOrdering comp, + BinarySearchFunction func) +{ + // use the vectorized path to implement the scalar version + + // allocate device buffers for value and output + thrust::detail::temporary_array d_value(exec,1); + thrust::detail::temporary_array d_output(exec,1); + + // copy value to device + d_value[0] = value; + + // perform the query + thrust::system::detail::generic::detail::binary_search(exec, begin, end, d_value.begin(), d_value.end(), d_output.begin(), comp, func); + + // copy result to host and return + return d_output[0]; +} + +} // end namespace detail + + +////////////////////// +// Scalar Functions // +////////////////////// + +template +ForwardIterator lower_bound(thrust::execution_policy &exec, + 
ForwardIterator begin, + ForwardIterator end, + const T& value) +{ + return thrust::lower_bound(exec, begin, end, value, thrust::less()); +} + +template +ForwardIterator lower_bound(thrust::execution_policy &exec, + ForwardIterator begin, + ForwardIterator end, + const T& value, + StrictWeakOrdering comp) +{ + typedef typename thrust::iterator_traits::difference_type difference_type; + + return begin + detail::binary_search(exec, begin, end, value, comp, detail::lbf()); +} + + +template +ForwardIterator upper_bound(thrust::execution_policy &exec, + ForwardIterator begin, + ForwardIterator end, + const T& value) +{ + return thrust::upper_bound(exec, begin, end, value, thrust::less()); +} + +template +ForwardIterator upper_bound(thrust::execution_policy &exec, + ForwardIterator begin, + ForwardIterator end, + const T& value, + StrictWeakOrdering comp) +{ + typedef typename thrust::iterator_traits::difference_type difference_type; + + return begin + detail::binary_search(exec, begin, end, value, comp, detail::ubf()); +} + + +template +bool binary_search(thrust::execution_policy &exec, + ForwardIterator begin, + ForwardIterator end, + const T& value) +{ + return thrust::binary_search(exec, begin, end, value, thrust::less()); +} + +template +bool binary_search(thrust::execution_policy &exec, + ForwardIterator begin, + ForwardIterator end, + const T& value, + StrictWeakOrdering comp) +{ + return detail::binary_search(exec, begin, end, value, comp, detail::bsf()); +} + + +////////////////////// +// Vector Functions // +////////////////////// + +template +OutputIterator lower_bound(thrust::execution_policy &exec, + ForwardIterator begin, + ForwardIterator end, + InputIterator values_begin, + InputIterator values_end, + OutputIterator output) +{ + typedef typename thrust::iterator_value::type ValueType; + + return thrust::lower_bound(exec, begin, end, values_begin, values_end, output, thrust::less()); +} + +template +OutputIterator lower_bound(thrust::execution_policy &exec, + ForwardIterator begin, + ForwardIterator end, + InputIterator values_begin, + InputIterator values_end, + OutputIterator output, + StrictWeakOrdering comp) +{ + return detail::binary_search(exec, begin, end, values_begin, values_end, output, comp, detail::lbf()); +} + + +template +OutputIterator upper_bound(thrust::execution_policy &exec, + ForwardIterator begin, + ForwardIterator end, + InputIterator values_begin, + InputIterator values_end, + OutputIterator output) +{ + typedef typename thrust::iterator_value::type ValueType; + + return thrust::upper_bound(exec, begin, end, values_begin, values_end, output, thrust::less()); +} + +template +OutputIterator upper_bound(thrust::execution_policy &exec, + ForwardIterator begin, + ForwardIterator end, + InputIterator values_begin, + InputIterator values_end, + OutputIterator output, + StrictWeakOrdering comp) +{ + return detail::binary_search(exec, begin, end, values_begin, values_end, output, comp, detail::ubf()); +} + + +template +OutputIterator binary_search(thrust::execution_policy &exec, + ForwardIterator begin, + ForwardIterator end, + InputIterator values_begin, + InputIterator values_end, + OutputIterator output) +{ + typedef typename thrust::iterator_value::type ValueType; + + return thrust::binary_search(exec, begin, end, values_begin, values_end, output, thrust::less()); +} + +template +OutputIterator binary_search(thrust::execution_policy &exec, + ForwardIterator begin, + ForwardIterator end, + InputIterator values_begin, + InputIterator values_end, + OutputIterator 
output, + StrictWeakOrdering comp) +{ + return detail::binary_search(exec, begin, end, values_begin, values_end, output, comp, detail::bsf()); +} + + +template +thrust::pair +equal_range(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + const LessThanComparable &value) +{ + return thrust::equal_range(exec, first, last, value, thrust::less()); +} + + +template +thrust::pair +equal_range(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + const T &value, + StrictWeakOrdering comp) +{ + ForwardIterator lb = thrust::lower_bound(exec, first, last, value, comp); + ForwardIterator ub = thrust::upper_bound(exec, first, last, value, comp); + return thrust::make_pair(lb, ub); +} + + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/generic/copy.h b/compat/thrust/system/detail/generic/copy.h new file mode 100644 index 0000000..8df98fe --- /dev/null +++ b/compat/thrust/system/detail/generic/copy.h @@ -0,0 +1,57 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + + +template + OutputIterator copy(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result); + + +template + OutputIterator copy_n(thrust::execution_policy &exec, + InputIterator first, + Size n, + OutputIterator result); + + +} // end generic +} // end detail +} // end system +} // end thrust + +#include + diff --git a/compat/thrust/system/detail/generic/copy.inl b/compat/thrust/system/detail/generic/copy.inl new file mode 100644 index 0000000..e081015 --- /dev/null +++ b/compat/thrust/system/detail/generic/copy.inl @@ -0,0 +1,80 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
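The vectorized searches above perform one search per query element inside a single for_each. A small usage sketch (not part of the commit):

    #include <thrust/binary_search.h>
    #include <thrust/device_vector.h>

    int main()
    {
        thrust::device_vector<int> haystack(5);
        haystack[0] = 0; haystack[1] = 2; haystack[2] = 5; haystack[3] = 7; haystack[4] = 8;

        thrust::device_vector<int> needles(3);
        needles[0] = 0; needles[1] = 1; needles[2] = 8;

        // one lower_bound per needle, executed in parallel
        thrust::device_vector<unsigned int> pos(3);
        thrust::lower_bound(haystack.begin(), haystack.end(),
                            needles.begin(), needles.end(),
                            pos.begin());
        // pos is now {0, 1, 4}
        return 0;
    }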
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + + +template + OutputIterator copy(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result) +{ + typedef typename thrust::iterator_value::type T; + return thrust::transform(exec, first, last, result, thrust::identity()); +} // end copy() + + +template + OutputIterator copy_n(thrust::execution_policy &exec, + InputIterator first, + Size n, + OutputIterator result) +{ + typedef typename thrust::iterator_value::type value_type; + typedef thrust::identity xfrm_type; + + // XXX why do we need to do this? figure out why, and then see if we can do without + typedef typename thrust::detail::unary_transform_functor::type functor_type; + + typedef thrust::tuple iterator_tuple; + typedef thrust::zip_iterator zip_iter; + + zip_iter zipped = thrust::make_zip_iterator(thrust::make_tuple(first,result)); + + return thrust::get<1>(thrust::for_each_n(exec, zipped, n, functor_type(xfrm_type())).get_iterator_tuple()); +} // end copy_n() + + +} // end generic +} // end detail +} // end system +} // end thrust + diff --git a/compat/thrust/system/detail/generic/copy_if.h b/compat/thrust/system/detail/generic/copy_if.h new file mode 100644 index 0000000..183f012 --- /dev/null +++ b/compat/thrust/system/detail/generic/copy_if.h @@ -0,0 +1,62 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + + +template + OutputIterator copy_if(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + Predicate pred); + + +template + OutputIterator copy_if(thrust::execution_policy &exec, + InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator result, + Predicate pred); + + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/detail/generic/copy_if.inl b/compat/thrust/system/detail/generic/copy_if.inl new file mode 100644 index 0000000..145561c --- /dev/null +++ b/compat/thrust/system/detail/generic/copy_if.inl @@ -0,0 +1,155 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ +namespace detail +{ + +template +OutputIterator copy_if(thrust::execution_policy &exec, + InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator result, + Predicate pred) +{ + __THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING(IndexType n = thrust::distance(first, last)); + + // compute {0,1} predicates + thrust::detail::temporary_array predicates(exec, n); + thrust::transform(exec, + stencil, + stencil + n, + predicates.begin(), + thrust::detail::predicate_to_integral(pred)); + + // scan {0,1} predicates + thrust::detail::temporary_array scatter_indices(exec, n); + thrust::exclusive_scan(exec, + predicates.begin(), + predicates.end(), + scatter_indices.begin(), + static_cast(0), + thrust::plus()); + + // scatter the true elements + thrust::scatter_if(exec, + first, + last, + scatter_indices.begin(), + predicates.begin(), + result, + thrust::identity()); + + // find the end of the new sequence + IndexType output_size = scatter_indices[n - 1] + predicates[n - 1]; + + return result + output_size; +} + +} // end namespace detail + + +template + OutputIterator copy_if(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + Predicate pred) +{ + // XXX it's potentially expensive to send [first,last) twice + // we should probably specialize this case for POD + // since we can safely keep the input in a temporary instead + // of doing two loads + return thrust::copy_if(exec, first, last, first, result, pred); +} // end copy_if() + + +template + OutputIterator copy_if(thrust::execution_policy &exec, + InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator result, + Predicate pred) +{ + typedef typename thrust::iterator_traits::difference_type difference_type; + + // empty sequence + if(first == last) + return result; + + difference_type n = thrust::distance(first, last); + + // create an unsigned version of n (we know n is positive from the comparison above) + // to avoid a warning in the compare below + typename thrust::detail::make_unsigned::type unsigned_n(n); + + // use 32-bit indices when possible (almost always) + if(sizeof(difference_type) > sizeof(unsigned int) && unsigned_n > (std::numeric_limits::max)()) + { + result = detail::copy_if(exec, first, last, stencil, result, pred); + } // end if + else + { + result = detail::copy_if(exec, first, last, stencil, result, pred); + } // end else + + return result; +} // end copy_if() + + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/generic/count.h b/compat/thrust/system/detail/generic/count.h new file mode 100644 index 0000000..bc4899e --- /dev/null +++ b/compat/thrust/system/detail/generic/count.h @@ -0,0 +1,46 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
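The stream compaction above is the predicate -> exclusive_scan -> scatter_if sequence, with 32-bit indices used whenever the input is small enough. A minimal usage sketch (not part of the commit):

    #include <thrust/copy.h>
    #include <thrust/device_vector.h>
    #include <thrust/functional.h>

    int main()
    {
        thrust::device_vector<int> data(4);
        data[0] = 10; data[1] = 20; data[2] = 30; data[3] = 40;

        thrust::device_vector<int> stencil(4);
        stencil[0] = 1; stencil[1] = 0; stencil[2] = 1; stencil[3] = 0;

        // keep data[i] wherever the stencil is nonzero
        thrust::device_vector<int> out(2);
        thrust::copy_if(data.begin(), data.end(), stencil.begin(), out.begin(),
                        thrust::identity<int>());
        // out is now {10, 30}
        return 0;
    }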
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + +template +typename thrust::iterator_traits::difference_type +count(thrust::execution_policy &exec, InputIterator first, InputIterator last, const EqualityComparable& value); + +template +typename thrust::iterator_traits::difference_type +count_if(thrust::execution_policy &exec, InputIterator first, InputIterator last, Predicate pred); + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/detail/generic/count.inl b/compat/thrust/system/detail/generic/count.inl new file mode 100644 index 0000000..e3ab871 --- /dev/null +++ b/compat/thrust/system/detail/generic/count.inl @@ -0,0 +1,75 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + +template +struct count_if_transform +{ + __host__ __device__ + count_if_transform(Predicate _pred) : pred(_pred){} + + __host__ __device__ + CountType operator()(const InputType& val) + { + if(pred(val)) + return 1; + else + return 0; + } // end operator() + + Predicate pred; +}; // end count_if_transform + +template +typename thrust::iterator_traits::difference_type +count(thrust::execution_policy &exec, InputIterator first, InputIterator last, const EqualityComparable& value) +{ + typedef typename thrust::iterator_traits::value_type InputType; + + // XXX use placeholder expression here + return thrust::count_if(exec, first, last, thrust::detail::equal_to_value(value)); +} // end count() + +template +typename thrust::iterator_traits::difference_type +count_if(thrust::execution_policy &exec, InputIterator first, InputIterator last, Predicate pred) +{ + typedef typename thrust::iterator_traits::value_type InputType; + typedef typename thrust::iterator_traits::difference_type CountType; + + thrust::system::detail::generic::count_if_transform unary_op(pred); + thrust::plus binary_op; + return thrust::transform_reduce(exec, first, last, unary_op, CountType(0), binary_op); +} // end count_if() + +} // end generic +} // end detail +} // end system +} // end thrust + diff --git a/compat/thrust/system/detail/generic/distance.h b/compat/thrust/system/detail/generic/distance.h new file mode 100644 index 0000000..80f051c --- /dev/null +++ b/compat/thrust/system/detail/generic/distance.h @@ -0,0 +1,42 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + +template + inline typename thrust::iterator_traits::difference_type + distance(InputIterator first, InputIterator last); + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/detail/generic/distance.inl b/compat/thrust/system/detail/generic/distance.inl new file mode 100644 index 0000000..a1fdf14 --- /dev/null +++ b/compat/thrust/system/detail/generic/distance.inl @@ -0,0 +1,69 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
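count() above forwards to count_if(), which maps each element to a 0/1 flag and sums the flags with transform_reduce(). A small usage sketch (not part of the commit):

    #include <thrust/count.h>
    #include <thrust/device_vector.h>

    struct is_odd
    {
        __host__ __device__
        bool operator()(int x) const { return (x % 2) != 0; }
    };

    int main()
    {
        thrust::device_vector<int> v(5);
        v[0] = 1; v[1] = 2; v[2] = 3; v[3] = 4; v[4] = 5;

        int twos = thrust::count(v.begin(), v.end(), 2);            // 1
        int odds = thrust::count_if(v.begin(), v.end(), is_odd());  // 3
        return (twos == 1 && odds == 3) ? 0 : 1;
    }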
+ */ + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ +namespace detail +{ + +template + inline typename thrust::iterator_traits::difference_type + distance(InputIterator first, InputIterator last, thrust::incrementable_traversal_tag) +{ + typename thrust::iterator_traits::difference_type result(0); + + while(first != last) + { + ++first; + ++result; + } // end while + + return result; +} // end advance() + +template + inline typename thrust::iterator_traits::difference_type + distance(InputIterator first, InputIterator last, thrust::random_access_traversal_tag) +{ + return last - first; +} // end distance() + +} // end detail + +template + inline typename thrust::iterator_traits::difference_type + distance(InputIterator first, InputIterator last) +{ + // dispatch on iterator traversal + return thrust::system::detail::generic::detail::distance(first, last, + typename thrust::iterator_traversal::type()); +} // end advance() + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/generic/equal.h b/compat/thrust/system/detail/generic/equal.h new file mode 100644 index 0000000..da7d105 --- /dev/null +++ b/compat/thrust/system/detail/generic/equal.h @@ -0,0 +1,43 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + +template +bool equal(thrust::execution_policy &exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2); + +template +bool equal(thrust::execution_policy &exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, BinaryPredicate binary_pred); + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/detail/generic/equal.inl b/compat/thrust/system/detail/generic/equal.inl new file mode 100644 index 0000000..12b8005 --- /dev/null +++ b/compat/thrust/system/detail/generic/equal.inl @@ -0,0 +1,50 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
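Like advance() earlier, distance() above dispatches on the iterator's traversal tag: random-access iterators get the constant-time form (last - first, i += n), anything else falls back to stepping one element at a time. A brief sketch (not part of the commit):

    #include <thrust/distance.h>
    #include <thrust/advance.h>
    #include <thrust/device_vector.h>

    int main()
    {
        thrust::device_vector<int> v(10);

        // device_vector iterators are random-access, so both calls resolve
        // to the O(1) overloads rather than the incrementable fallback
        thrust::device_vector<int>::iterator i = v.begin();
        thrust::advance(i, 7);
        int d = thrust::distance(v.begin(), i);  // 7
        return (d == 7) ? 0 : 1;
    }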
+ */ + +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + +template +bool equal(thrust::execution_policy &exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2) +{ + typedef typename thrust::iterator_traits::value_type InputType1; + + return thrust::equal(exec, first1, last1, first2, thrust::detail::equal_to()); +} + +template +bool equal(thrust::execution_policy &exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, BinaryPredicate binary_pred) +{ + return thrust::mismatch(exec, first1, last1, first2, binary_pred).first == last1; +} + +} // end generic +} // end detail +} // end system +} // end thrust + diff --git a/compat/thrust/system/detail/generic/extrema.h b/compat/thrust/system/detail/generic/extrema.h new file mode 100644 index 0000000..abb4ddc --- /dev/null +++ b/compat/thrust/system/detail/generic/extrema.h @@ -0,0 +1,76 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file extrema.h + * \brief Generic device implementations of extrema functions. + */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + +template +ForwardIterator max_element(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last); + +template +ForwardIterator max_element(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + BinaryPredicate comp); + +template +ForwardIterator min_element(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last); + +template +ForwardIterator min_element(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + BinaryPredicate comp); + +template +thrust::pair minmax_element(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last); + +template +thrust::pair minmax_element(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + BinaryPredicate comp); + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/detail/generic/extrema.inl b/compat/thrust/system/detail/generic/extrema.inl new file mode 100644 index 0000000..b5f92c3 --- /dev/null +++ b/compat/thrust/system/detail/generic/extrema.inl @@ -0,0 +1,244 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
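equal() above reduces to mismatch(first1, last1, first2, pred).first == last1. A short usage sketch (not part of the commit):

    #include <thrust/equal.h>
    #include <thrust/device_vector.h>

    int main()
    {
        thrust::device_vector<int> a(3), b(3);
        a[0] = 1; a[1] = 2; a[2] = 3;
        b[0] = 1; b[1] = 2; b[2] = 3;

        bool same = thrust::equal(a.begin(), a.end(), b.begin());
        return same ? 0 : 1;
    }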
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file distance.h + * \brief Device implementations for distance. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ +namespace detail +{ + +////////////// +// Functors // +////////////// + +// return the smaller/larger element making sure to prefer the +// first occurance of the minimum/maximum element +template +struct min_element_reduction +{ + BinaryPredicate comp; + + __host__ __device__ + min_element_reduction(BinaryPredicate comp) : comp(comp){} + + __host__ __device__ + thrust::tuple + operator()(const thrust::tuple& lhs, + const thrust::tuple& rhs ) + { + if(comp(thrust::get<0>(lhs), thrust::get<0>(rhs))) + return lhs; + if(comp(thrust::get<0>(rhs), thrust::get<0>(lhs))) + return rhs; + + // values are equivalent, prefer value with smaller index + if(thrust::get<1>(lhs) < thrust::get<1>(rhs)) + return lhs; + else + return rhs; + } // end operator()() + +}; // end min_element_reduction + + +template +struct max_element_reduction +{ + BinaryPredicate comp; + + __host__ __device__ + max_element_reduction(BinaryPredicate comp) : comp(comp){} + + __host__ __device__ + thrust::tuple + operator()(const thrust::tuple& lhs, + const thrust::tuple& rhs ) + { + if(comp(thrust::get<0>(lhs), thrust::get<0>(rhs))) + return rhs; + if(comp(thrust::get<0>(rhs), thrust::get<0>(lhs))) + return lhs; + + // values are equivalent, prefer value with smaller index + if(thrust::get<1>(lhs) < thrust::get<1>(rhs)) + return lhs; + else + return rhs; + } // end operator()() + +}; // end max_element_reduction + +// return the smaller & larger element making sure to prefer the +// first occurance of the minimum/maximum element +template +struct minmax_element_reduction +{ + BinaryPredicate comp; + + minmax_element_reduction(BinaryPredicate comp) : comp(comp){} + + __host__ __device__ + thrust::tuple< thrust::tuple, thrust::tuple > + operator()(const thrust::tuple< thrust::tuple, thrust::tuple >& lhs, + const thrust::tuple< thrust::tuple, thrust::tuple >& rhs ) + { + + return thrust::make_tuple(min_element_reduction(comp)(thrust::get<0>(lhs), thrust::get<0>(rhs)), + max_element_reduction(comp)(thrust::get<1>(lhs), thrust::get<1>(rhs))); + } // end operator()() +}; // end minmax_element_reduction + +template +struct duplicate_tuple +{ + __host__ __device__ + thrust::tuple< thrust::tuple, thrust::tuple > + operator()(const thrust::tuple& t) + { + return thrust::make_tuple(t, t); + } +}; // end duplicate_tuple + +} // end namespace detail + +template +ForwardIterator min_element(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last) +{ + typedef typename thrust::iterator_value::type value_type; + + return thrust::min_element(exec, first, last, thrust::less()); +} // end min_element() + +template +ForwardIterator min_element(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + BinaryPredicate comp) +{ + if (first == last) + return last; + + typedef typename thrust::iterator_traits::value_type InputType; + typedef typename thrust::iterator_traits::difference_type IndexType; + + thrust::tuple result = + thrust::reduce + (exec, + thrust::make_zip_iterator(thrust::make_tuple(first, thrust::counting_iterator(0))), + thrust::make_zip_iterator(thrust::make_tuple(first, thrust::counting_iterator(0))) + 
(last - first), + thrust::tuple(*first, 0), + detail::min_element_reduction(comp)); + + return first + thrust::get<1>(result); +} // end min_element() + +template +ForwardIterator max_element(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last) +{ + typedef typename thrust::iterator_value::type value_type; + + return thrust::max_element(exec, first, last, thrust::less()); +} // end max_element() + +template +ForwardIterator max_element(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + BinaryPredicate comp) +{ + if (first == last) + return last; + + typedef typename thrust::iterator_traits::value_type InputType; + typedef typename thrust::iterator_traits::difference_type IndexType; + + thrust::tuple result = + thrust::reduce + (exec, + thrust::make_zip_iterator(thrust::make_tuple(first, thrust::counting_iterator(0))), + thrust::make_zip_iterator(thrust::make_tuple(first, thrust::counting_iterator(0))) + (last - first), + thrust::tuple(*first, 0), + detail::max_element_reduction(comp)); + + return first + thrust::get<1>(result); +} // end max_element() + +template +thrust::pair minmax_element(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last) +{ + typedef typename thrust::iterator_value::type value_type; + + return thrust::minmax_element(exec, first, last, thrust::less()); +} // end minmax_element() + +template +thrust::pair minmax_element(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + BinaryPredicate comp) +{ + if (first == last) + return thrust::make_pair(last, last); + + typedef typename thrust::iterator_traits::value_type InputType; + typedef typename thrust::iterator_traits::difference_type IndexType; + + thrust::tuple< thrust::tuple, thrust::tuple > result = + thrust::transform_reduce + (exec, + thrust::make_zip_iterator(thrust::make_tuple(first, thrust::counting_iterator(0))), + thrust::make_zip_iterator(thrust::make_tuple(first, thrust::counting_iterator(0))) + (last - first), + detail::duplicate_tuple(), + detail::duplicate_tuple()(thrust::tuple(*first, 0)), + detail::minmax_element_reduction(comp)); + + return thrust::make_pair(first + thrust::get<1>(thrust::get<0>(result)), first + thrust::get<1>(thrust::get<1>(result))); +} // end minmax_element() + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/generic/fill.h b/compat/thrust/system/detail/generic/fill.h new file mode 100644 index 0000000..9745b1c --- /dev/null +++ b/compat/thrust/system/detail/generic/fill.h @@ -0,0 +1,63 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file fill.h + * \brief Device implementation of fill. 
+ */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + + +template + OutputIterator fill_n(thrust::execution_policy &exec, + OutputIterator first, + Size n, + const T &value) +{ + // XXX consider using the placeholder expression _1 = value + return thrust::generate_n(exec, first, n, thrust::detail::fill_functor(value)); +} + +template + void fill(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + const T &value) +{ + // XXX consider using the placeholder expression _1 = value + thrust::generate(exec, first, last, thrust::detail::fill_functor(value)); +} + + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/generic/find.h b/compat/thrust/system/detail/generic/find.h new file mode 100644 index 0000000..08888c5 --- /dev/null +++ b/compat/thrust/system/detail/generic/find.h @@ -0,0 +1,56 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + +template +InputIterator find(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + const T& value); + +template +InputIterator find_if(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + Predicate pred); + +template +InputIterator find_if_not(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + Predicate pred); + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/detail/generic/find.inl b/compat/thrust/system/detail/generic/find.inl new file mode 100644 index 0000000..a3414e1 --- /dev/null +++ b/compat/thrust/system/detail/generic/find.inl @@ -0,0 +1,141 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
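The min_element/max_element/minmax_element code above turns an extremum search into a single reduction over (value, index) pairs, resolving ties toward the smaller index so the first occurrence of the extremum wins. A host-side sketch of that reduction with std::accumulate (min_index_by_reduction is an illustrative name; the input is assumed non-empty):

#include <cassert>
#include <cstddef>
#include <functional>
#include <numeric>
#include <utility>
#include <vector>

template <typename T, typename Compare>
std::size_t min_index_by_reduction(const std::vector<T>& v, Compare comp)
{
    std::pair<T, std::size_t> init(v[0], 0);   // seed with the first element, as the generic code does
    std::size_t i = 0;
    auto combine = [&](std::pair<T, std::size_t> best, const T& x) {
        std::pair<T, std::size_t> cur(x, i++);
        if (comp(best.first, cur.first)) return best;
        if (comp(cur.first, best.first)) return cur;
        return best.second < cur.second ? best : cur;   // equivalent values: keep the earlier index
    };
    return std::accumulate(v.begin(), v.end(), init, combine).second;
}

int main()
{
    std::vector<int> v{5, 1, 3, 1};
    assert(min_index_by_reduction(v, std::less<int>()) == 1);   // first of the two 1s
    return 0;
}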
+ */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + + +// Contributed by Erich Elsen + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + + +template +InputIterator find(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + const T& value) +{ + // XXX consider a placeholder expression here + return thrust::find_if(exec, first, last, thrust::detail::equal_to_value(value)); +} // end find() + + +template +struct find_if_functor +{ + __host__ __device__ + TupleType operator()(const TupleType& lhs, const TupleType& rhs) const + { + // select the smallest index among true results + if (thrust::get<0>(lhs) && thrust::get<0>(rhs)) + return TupleType(true, (thrust::min)(thrust::get<1>(lhs), thrust::get<1>(rhs))); + else if (thrust::get<0>(lhs)) + return lhs; + else + return rhs; + } +}; + + +template +InputIterator find_if(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + Predicate pred) +{ + typedef typename thrust::iterator_traits::difference_type difference_type; + typedef typename thrust::tuple result_type; + + // empty sequence + if (first == last) + return last; + + const difference_type n = thrust::distance(first, last); + + // this implementation breaks up the sequence into separate intervals + // in an attempt to early-out as soon as a value is found + + // TODO incorporate sizeof(InputType) into interval_threshold and round to multiple of 32 + const difference_type interval_threshold = 1 << 20; + const difference_type interval_size = (std::min)(interval_threshold, n); + + // force transform_iterator output to bool + typedef thrust::transform_iterator XfrmIterator; + typedef thrust::tuple > IteratorTuple; + typedef thrust::zip_iterator ZipIterator; + + IteratorTuple iter_tuple = thrust::make_tuple(XfrmIterator(first, pred), + thrust::counting_iterator(0)); + + ZipIterator begin = thrust::make_zip_iterator(iter_tuple); + ZipIterator end = begin + n; + + for(ZipIterator interval_begin = begin; interval_begin < end; interval_begin += interval_size) + { + ZipIterator interval_end = interval_begin + interval_size; + if(end < interval_end) + { + interval_end = end; + } // end if + + result_type result = thrust::reduce(exec, + interval_begin, interval_end, + result_type(false,interval_end - begin), + find_if_functor()); + + // see if we found something + if (thrust::get<0>(result)) + { + return first + thrust::get<1>(result); + } + } + + //nothing was found if we reach here... + return first + n; +} + + +template +InputIterator find_if_not(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + Predicate pred) +{ + return thrust::find_if(exec, first, last, thrust::detail::not1(pred)); +} // end find() + + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/generic/for_each.h b/compat/thrust/system/detail/generic/for_each.h new file mode 100644 index 0000000..61abe20 --- /dev/null +++ b/compat/thrust/system/detail/generic/for_each.h @@ -0,0 +1,72 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
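The find_if above notes that it breaks the sequence into fixed-size intervals so it can stop as soon as one interval contains a hit. A sequential sketch of that chunked early-out, assuming random-access iterators (chunked_find_if and kChunk are illustrative names):

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

template <typename It, typename Pred>
It chunked_find_if(It first, It last, Pred pred)
{
    const std::ptrdiff_t kChunk = 1 << 20;                 // plays the role of interval_threshold
    for (It chunk_begin = first; chunk_begin != last; )
    {
        It chunk_end = (last - chunk_begin > kChunk) ? chunk_begin + kChunk : last;
        It hit = std::find_if(chunk_begin, chunk_end, pred);
        if (hit != chunk_end)
            return hit;                                    // early out: later chunks are never inspected
        chunk_begin = chunk_end;
    }
    return last;
}

int main()
{
    std::vector<int> v(100, 0);
    v[42] = 7;
    assert(chunked_find_if(v.begin(), v.end(), [](int x) { return x == 7; }) == v.begin() + 42);
    return 0;
}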
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file for_each.h + * \brief Generic implementation of for_each & for_each_n. + * It is an error to call these functions; they have no implementation. + */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + + +template +InputIterator for_each(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + UnaryFunction f) +{ + // unimplemented + THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); + return first; +} // end for_each() + + +template +InputIterator for_each_n(thrust::execution_policy &exec, + InputIterator first, + Size n, + UnaryFunction f) +{ + // unimplemented + THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); + return first; +} // end for_each_n() + + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/generic/gather.h b/compat/thrust/system/detail/generic/gather.h new file mode 100644 index 0000000..cfb6f85 --- /dev/null +++ b/compat/thrust/system/detail/generic/gather.h @@ -0,0 +1,78 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + + +template + OutputIterator gather(thrust::execution_policy &exec, + InputIterator map_first, + InputIterator map_last, + RandomAccessIterator input_first, + OutputIterator result); + + +template + OutputIterator gather_if(thrust::execution_policy &exec, + InputIterator1 map_first, + InputIterator1 map_last, + InputIterator2 stencil, + RandomAccessIterator input_first, + OutputIterator result); + + +template + OutputIterator gather_if(thrust::execution_policy &exec, + InputIterator1 map_first, + InputIterator1 map_last, + InputIterator2 stencil, + RandomAccessIterator input_first, + OutputIterator result, + Predicate pred); + + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/detail/generic/gather.inl b/compat/thrust/system/detail/generic/gather.inl new file mode 100644 index 0000000..ab2cdd8 --- /dev/null +++ b/compat/thrust/system/detail/generic/gather.inl @@ -0,0 +1,102 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
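The generic for_each above is a deliberate stub: its THRUST_STATIC_ASSERT depends on a template parameter, so the compile error fires only if a backend fails to provide a real overload and this fallback actually gets instantiated. A sketch of that dependent-assert idiom in standard C++ (dependent_false and unimplemented_for_each are illustrative names):

#include <type_traits>

template <typename T>
struct dependent_false : std::false_type {};   // stands in for depend_on_instantiation

template <typename InputIterator, typename UnaryFunction>
InputIterator unimplemented_for_each(InputIterator first, InputIterator, UnaryFunction)
{
    // Fires only when this template is instantiated, i.e. only if a caller reaches the fallback.
    static_assert(dependent_false<InputIterator>::value,
                  "for_each must be provided by the selected backend");
    return first;
}

int main()
{
    // Compiles as long as the stub is never called; any call site would fail at compile time.
    return 0;
}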
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + +template + OutputIterator gather(thrust::execution_policy &exec, + InputIterator map_first, + InputIterator map_last, + RandomAccessIterator input_first, + OutputIterator result) +{ + return thrust::transform(exec, + thrust::make_permutation_iterator(input_first, map_first), + thrust::make_permutation_iterator(input_first, map_last), + result, + thrust::identity::type>()); +} // end gather() + + +template + OutputIterator gather_if(thrust::execution_policy &exec, + InputIterator1 map_first, + InputIterator1 map_last, + InputIterator2 stencil, + RandomAccessIterator input_first, + OutputIterator result) +{ + typedef typename thrust::iterator_value::type StencilType; + return thrust::gather_if(exec, + map_first, + map_last, + stencil, + input_first, + result, + thrust::identity()); +} // end gather_if() + + +template + OutputIterator gather_if(thrust::execution_policy &exec, + InputIterator1 map_first, + InputIterator1 map_last, + InputIterator2 stencil, + RandomAccessIterator input_first, + OutputIterator result, + Predicate pred) +{ + typedef typename thrust::iterator_value::type InputType; + return thrust::transform_if(exec, + thrust::make_permutation_iterator(input_first, map_first), + thrust::make_permutation_iterator(input_first, map_last), + stencil, + result, + thrust::identity(), + pred); +} // end gather_if() + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/generic/generate.h b/compat/thrust/system/detail/generic/generate.h new file mode 100644 index 0000000..e7a8e00 --- /dev/null +++ b/compat/thrust/system/detail/generic/generate.h @@ -0,0 +1,55 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
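The gather above is just a transform through a permutation of the input: result[i] = input[map[i]]. A host-side sketch with the permutation spelled out by hand (gather_into is an illustrative name; the map entries are assumed to be valid indices):

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

template <typename T, typename Index>
void gather_into(const std::vector<Index>& map, const std::vector<T>& input, std::vector<T>& result)
{
    result.resize(map.size());
    std::transform(map.begin(), map.end(), result.begin(),
                   [&](Index i) { return input[i]; });     // hand-rolled permutation "iterator"
}

int main()
{
    std::vector<int> input{10, 20, 30, 40};
    std::vector<std::size_t> map{3, 0, 2};
    std::vector<int> out;
    gather_into(map, input, out);
    assert((out == std::vector<int>{40, 10, 30}));
    return 0;
}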
+ */ + + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + +template + void generate(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + Generator gen); + +template + OutputIterator generate_n(thrust::execution_policy &exec, + OutputIterator first, + Size n, + Generator gen); + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/detail/generic/generate.inl b/compat/thrust/system/detail/generic/generate.inl new file mode 100644 index 0000000..4da5763 --- /dev/null +++ b/compat/thrust/system/detail/generic/generate.inl @@ -0,0 +1,59 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + +template + void generate(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + Generator gen) +{ + thrust::for_each(exec, first, last, typename thrust::detail::generate_functor::type(gen)); +} // end generate() + +template + OutputIterator generate_n(thrust::execution_policy &exec, + OutputIterator first, + Size n, + Generator gen) +{ + return thrust::for_each_n(exec, first, n, typename thrust::detail::generate_functor::type(gen)); +} // end generate() + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/generic/inner_product.h b/compat/thrust/system/detail/generic/inner_product.h new file mode 100644 index 0000000..9ac5c69 --- /dev/null +++ b/compat/thrust/system/detail/generic/inner_product.h @@ -0,0 +1,54 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + +template + OutputType inner_product(thrust::execution_policy &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + OutputType init); + +template +OutputType inner_product(thrust::execution_policy &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + OutputType init, + BinaryFunction1 binary_op1, + BinaryFunction2 binary_op2); + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/detail/generic/inner_product.inl b/compat/thrust/system/detail/generic/inner_product.inl new file mode 100644 index 0000000..b6a339e --- /dev/null +++ b/compat/thrust/system/detail/generic/inner_product.inl @@ -0,0 +1,70 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + + +template +OutputType inner_product(thrust::execution_policy &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + OutputType init) +{ + thrust::plus binary_op1; + thrust::multiplies binary_op2; + return thrust::inner_product(exec, first1, last1, first2, init, binary_op1, binary_op2); +} // end inner_product() + + +template +OutputType inner_product(thrust::execution_policy &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + OutputType init, + BinaryFunction1 binary_op1, + BinaryFunction2 binary_op2) +{ + typedef thrust::zip_iterator > ZipIter; + + ZipIter first = thrust::make_zip_iterator(thrust::make_tuple(first1,first2)); + + // only the first iterator in the tuple is relevant for the purposes of last + ZipIter last = thrust::make_zip_iterator(thrust::make_tuple(last1, first2)); + + return thrust::transform_reduce(exec, first, last, thrust::detail::zipped_binary_op(binary_op2), init, binary_op1); +} // end inner_product() + + +} // end generic +} // end detail +} // end system +} // end thrust + diff --git a/compat/thrust/system/detail/generic/logical.h b/compat/thrust/system/detail/generic/logical.h new file mode 100644 index 0000000..e0d01e3 --- /dev/null +++ b/compat/thrust/system/detail/generic/logical.h @@ -0,0 +1,56 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
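The inner_product above zips the two sequences and performs a single transform_reduce, i.e. acc = op1(acc, op2(a[i], b[i])). The same fold written as a plain host loop (zipped_inner_product is an illustrative name):

#include <cassert>
#include <functional>
#include <vector>

template <typename It1, typename It2, typename T, typename Op1, typename Op2>
T zipped_inner_product(It1 first1, It1 last1, It2 first2, T init, Op1 op1, Op2 op2)
{
    for (; first1 != last1; ++first1, ++first2)
        init = op1(init, op2(*first1, *first2));   // reduce over the element-wise op2 results
    return init;
}

int main()
{
    std::vector<int> a{1, 2, 3}, b{4, 5, 6};
    assert(zipped_inner_product(a.begin(), a.end(), b.begin(), 0,
                                std::plus<int>(), std::multiplies<int>()) == 32);   // 1*4 + 2*5 + 3*6
    return 0;
}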
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+template<typename DerivedPolicy, typename InputIterator, typename Predicate>
+bool all_of(thrust::execution_policy<DerivedPolicy> &exec, InputIterator first, InputIterator last, Predicate pred)
+{
+  return thrust::find_if(exec, first, last, thrust::detail::not1(pred)) == last;
+}
+
+template<typename DerivedPolicy, typename InputIterator, typename Predicate>
+bool any_of(thrust::execution_policy<DerivedPolicy> &exec, InputIterator first, InputIterator last, Predicate pred)
+{
+  return thrust::find_if(exec, first, last, pred) != last;
+}
+
+template<typename DerivedPolicy, typename InputIterator, typename Predicate>
+bool none_of(thrust::execution_policy<DerivedPolicy> &exec, InputIterator first, InputIterator last, Predicate pred)
+{
+  return !thrust::any_of(exec, first, last, pred);
+}
+
+} // end generic
+} // end detail
+} // end system
+} // end thrust
+
diff --git a/compat/thrust/system/detail/generic/memory.h b/compat/thrust/system/detail/generic/memory.h
new file mode 100644
index 0000000..c0fe623
--- /dev/null
+++ b/compat/thrust/system/detail/generic/memory.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright 2008-2012 NVIDIA Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/*! \file generic/memory.h
+ * \brief Generic implementation of memory functions.
+ * Calling some of these is an error. They have no implementation.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace thrust
+{
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+
+template<typename DerivedPolicy, typename Size> void malloc(thrust::execution_policy<DerivedPolicy> &, Size);
+
+template<typename T, typename DerivedPolicy>
+thrust::pointer<T,DerivedPolicy> malloc(thrust::execution_policy<DerivedPolicy> &s, std::size_t n);
+
+template<typename DerivedPolicy, typename Pointer> void free(thrust::execution_policy<DerivedPolicy> &, Pointer);
+
+template<typename Pointer1, typename Pointer2>
+__host__ __device__
+void assign_value(tag, Pointer1, Pointer2);
+
+template<typename DerivedPolicy, typename Pointer>
+__host__ __device__
+void get_value(thrust::execution_policy<DerivedPolicy> &, Pointer);
+
+template<typename Pointer1, typename Pointer2>
+__host__ __device__
+void iter_swap(tag, Pointer1, Pointer2);
+
+} // end generic
+} // end detail
+} // end system
+} // end thrust
+
+#include <thrust/system/detail/generic/memory.inl>
+
diff --git a/compat/thrust/system/detail/generic/memory.inl b/compat/thrust/system/detail/generic/memory.inl
new file mode 100644
index 0000000..f89a763
--- /dev/null
+++ b/compat/thrust/system/detail/generic/memory.inl
@@ -0,0 +1,92 @@
+/*
+ * Copyright 2008-2012 NVIDIA Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
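all_of, any_of and none_of above all collapse to a single find_if, which is what lets one search primitive serve every quantifier. The same reductions with the standard library (the *_via_find names are illustrative):

#include <algorithm>
#include <cassert>
#include <vector>

template <typename It, typename Pred>
bool all_of_via_find(It first, It last, Pred pred)
{
    return std::find_if_not(first, last, pred) == last;
}

template <typename It, typename Pred>
bool any_of_via_find(It first, It last, Pred pred)
{
    return std::find_if(first, last, pred) != last;
}

template <typename It, typename Pred>
bool none_of_via_find(It first, It last, Pred pred)
{
    return !any_of_via_find(first, last, pred);
}

int main()
{
    std::vector<int> v{2, 4, 6};
    auto even = [](int x) { return x % 2 == 0; };
    assert(all_of_via_find(v.begin(), v.end(), even));
    assert(any_of_via_find(v.begin(), v.end(), even));
    assert(!none_of_via_find(v.begin(), v.end(), even));
    return 0;
}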
+ */ + +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + + +template + void malloc(thrust::execution_policy &, Size) +{ + // unimplemented + THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); +} + + +template + thrust::pointer + malloc(thrust::execution_policy &exec, std::size_t n) +{ + thrust::pointer void_ptr = thrust::malloc(exec, sizeof(T) * n); + + return pointer(static_cast(void_ptr.get())); +} // end malloc() + + +template + void free(thrust::execution_policy &, Pointer) +{ + // unimplemented + THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); +} + + +template +__host__ __device__ +void assign_value(thrust::execution_policy &, Pointer1, Pointer2) +{ + // unimplemented + THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); +} + + +template +__host__ __device__ +void get_value(thrust::execution_policy &, Pointer) +{ + // unimplemented + THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); +} + + +template +__host__ __device__ +void iter_swap(tag, Pointer1, Pointer2) +{ + // unimplemented + THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); +} + + +} // end generic +} // end detail +} // end system +} // end thrust + diff --git a/compat/thrust/system/detail/generic/merge.h b/compat/thrust/system/detail/generic/merge.h new file mode 100644 index 0000000..5f0b996 --- /dev/null +++ b/compat/thrust/system/detail/generic/merge.h @@ -0,0 +1,87 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + + +// XXX calling this function is an error; there is no implementation +template + OutputIterator merge(thrust::execution_policy &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + StrictWeakOrdering comp); + + +template + OutputIterator merge(thrust::execution_policy &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result); + + +template + thrust::pair + merge_by_key(thrust::execution_policy &exec, + InputIterator1 keys_first1, InputIterator1 keys_last1, + InputIterator2 keys_first2, InputIterator2 keys_last2, + InputIterator3 values_first1, InputIterator4 values_first2, + OutputIterator1 keys_result, + OutputIterator2 values_result, + Compare comp); + + +template + thrust::pair + merge_by_key(thrust::execution_policy &exec, + InputIterator1 keys_first1, InputIterator1 keys_last1, + InputIterator2 keys_first2, InputIterator2 keys_last2, + InputIterator3 values_first1, InputIterator4 values_first2, + OutputIterator1 keys_result, + OutputIterator2 values_result); + + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/detail/generic/merge.inl b/compat/thrust/system/detail/generic/merge.inl new file mode 100644 index 0000000..b913611 --- /dev/null +++ b/compat/thrust/system/detail/generic/merge.inl @@ -0,0 +1,125 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + + +template + OutputIterator merge(thrust::execution_policy &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + StrictWeakOrdering comp) +{ + // unimplemented + THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); + return result; +} // end merge() + + +template + OutputIterator merge(thrust::execution_policy &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result) +{ + typedef typename thrust::iterator_value::type value_type; + return thrust::merge(exec,first1,last1,first2,last2,result,thrust::less()); +} // end merge() + + +template + thrust::pair + merge_by_key(thrust::execution_policy &exec, + InputIterator1 keys_first1, InputIterator1 keys_last1, + InputIterator2 keys_first2, InputIterator2 keys_last2, + InputIterator3 values_first1, InputIterator4 values_first2, + OutputIterator1 keys_result, + OutputIterator2 values_result, + Compare comp) +{ + typedef thrust::tuple iterator_tuple1; + typedef thrust::tuple iterator_tuple2; + typedef thrust::tuple iterator_tuple3; + + typedef thrust::zip_iterator zip_iterator1; + typedef thrust::zip_iterator zip_iterator2; + typedef thrust::zip_iterator zip_iterator3; + + zip_iterator1 zipped_first1 = thrust::make_zip_iterator(thrust::make_tuple(keys_first1, values_first1)); + zip_iterator1 zipped_last1 = thrust::make_zip_iterator(thrust::make_tuple(keys_last1, values_first1)); + + zip_iterator2 zipped_first2 = thrust::make_zip_iterator(thrust::make_tuple(keys_first2, values_first2)); + zip_iterator2 zipped_last2 = thrust::make_zip_iterator(thrust::make_tuple(keys_last2, values_first2)); + + zip_iterator3 zipped_result = thrust::make_zip_iterator(thrust::make_tuple(keys_result, values_result)); + + thrust::detail::compare_first comp_first(comp); + + iterator_tuple3 result = thrust::merge(exec, zipped_first1, zipped_last1, zipped_first2, zipped_last2, zipped_result, comp_first).get_iterator_tuple(); + + return thrust::make_pair(thrust::get<0>(result), thrust::get<1>(result)); +} // end merge_by_key() + + +template + thrust::pair + merge_by_key(thrust::execution_policy &exec, + InputIterator1 keys_first1, InputIterator1 keys_last1, + InputIterator2 keys_first2, InputIterator2 keys_last2, + InputIterator3 values_first1, InputIterator4 values_first2, + OutputIterator1 keys_result, + OutputIterator2 values_result) +{ + typedef typename thrust::iterator_value::type value_type; + return thrust::merge_by_key(exec, keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result, thrust::less()); +} // end merge_by_key() + + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/generic/mismatch.h b/compat/thrust/system/detail/generic/mismatch.h new file mode 100644 index 0000000..dc581ff --- /dev/null +++ b/compat/thrust/system/detail/generic/mismatch.h @@ -0,0 +1,56 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
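merge_by_key above zips each key with its value and merges the zipped ranges with compare_first, a comparator that only ever looks at the key half of each pair. A host-side sketch of that key-only merge (merge_by_key_host is an illustrative name; both key ranges are assumed sorted under comp):

#include <cassert>
#include <cstddef>
#include <functional>
#include <vector>

template <typename K, typename V, typename Compare>
void merge_by_key_host(const std::vector<K>& k1, const std::vector<V>& v1,
                       const std::vector<K>& k2, const std::vector<V>& v2,
                       std::vector<K>& keys_out, std::vector<V>& vals_out, Compare comp)
{
    std::size_t i = 0, j = 0;
    while (i < k1.size() && j < k2.size())
    {
        // only the keys are compared; the values just ride along, as with compare_first
        if (comp(k2[j], k1[i])) { keys_out.push_back(k2[j]); vals_out.push_back(v2[j]); ++j; }
        else                    { keys_out.push_back(k1[i]); vals_out.push_back(v1[i]); ++i; }
    }
    for (; i < k1.size(); ++i) { keys_out.push_back(k1[i]); vals_out.push_back(v1[i]); }
    for (; j < k2.size(); ++j) { keys_out.push_back(k2[j]); vals_out.push_back(v2[j]); }
}

int main()
{
    std::vector<int> k1{1, 3}, k2{2, 4};
    std::vector<char> v1{'a', 'c'}, v2{'b', 'd'};
    std::vector<int> ko;
    std::vector<char> vo;
    merge_by_key_host(k1, v1, k2, v2, ko, vo, std::less<int>());
    assert((ko == std::vector<int>{1, 2, 3, 4} && vo == std::vector<char>{'a', 'b', 'c', 'd'}));
    return 0;
}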
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + + +template + thrust::pair + mismatch(thrust::execution_policy &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2); + + +template + thrust::pair + mismatch(thrust::execution_policy &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + BinaryPredicate pred); + + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/detail/generic/mismatch.inl b/compat/thrust/system/detail/generic/mismatch.inl new file mode 100644 index 0000000..923c27f --- /dev/null +++ b/compat/thrust/system/detail/generic/mismatch.inl @@ -0,0 +1,70 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + +template + thrust::pair + mismatch(thrust::execution_policy &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2) +{ + typedef typename thrust::iterator_value::type InputType1; + + // XXX use a placeholder expression here + return thrust::mismatch(exec, first1, last1, first2, thrust::detail::equal_to()); +} // end mismatch() + +template + thrust::pair + mismatch(thrust::execution_policy &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + BinaryPredicate pred) +{ + // Contributed by Erich Elsen + typedef thrust::tuple IteratorTuple; + typedef thrust::zip_iterator ZipIterator; + + ZipIterator zipped_first = thrust::make_zip_iterator(thrust::make_tuple(first1,first2)); + ZipIterator zipped_last = thrust::make_zip_iterator(thrust::make_tuple(last1, first2)); + + ZipIterator result = thrust::find_if_not(exec, zipped_first, zipped_last, thrust::detail::tuple_binary_predicate(pred)); + + return thrust::make_pair(thrust::get<0>(result.get_iterator_tuple()), + thrust::get<1>(result.get_iterator_tuple())); +} // end mismatch() + +} // end generic +} // end detail +} // end system +} // end thrust + diff --git a/compat/thrust/system/detail/generic/partition.h b/compat/thrust/system/detail/generic/partition.h new file mode 100644 index 0000000..63daa1d --- /dev/null +++ b/compat/thrust/system/detail/generic/partition.h @@ -0,0 +1,150 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file partition.h + * \brief Generic implementations of partition functions. 
+ */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + +template + ForwardIterator stable_partition(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + Predicate pred); + +template + ForwardIterator stable_partition(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + InputIterator stencil, + Predicate pred); + +template + thrust::pair + stable_partition_copy(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator1 out_true, + OutputIterator2 out_false, + Predicate pred); + +template + thrust::pair + stable_partition_copy(thrust::execution_policy &exec, + InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator1 out_true, + OutputIterator2 out_false, + Predicate pred); + +template + ForwardIterator partition(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + Predicate pred); + +template + ForwardIterator partition(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + InputIterator stencil, + Predicate pred); + +template + thrust::pair + partition_copy(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator1 out_true, + OutputIterator2 out_false, + Predicate pred); + +template + thrust::pair + partition_copy(thrust::execution_policy &exec, + InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator1 out_true, + OutputIterator2 out_false, + Predicate pred); + +template + ForwardIterator partition_point(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + Predicate pred); + +template + bool is_partitioned(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + Predicate pred); + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/detail/generic/partition.inl b/compat/thrust/system/detail/generic/partition.inl new file mode 100644 index 0000000..3298afc --- /dev/null +++ b/compat/thrust/system/detail/generic/partition.inl @@ -0,0 +1,238 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + + +template + ForwardIterator stable_partition(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + Predicate pred) +{ + typedef typename thrust::iterator_traits::value_type InputType; + + // copy input to temp buffer + thrust::detail::temporary_array temp(exec, first, last); + + // count the size of the true partition + typename thrust::iterator_difference::type num_true = thrust::count_if(exec, first,last,pred); + + // point to the beginning of the false partition + ForwardIterator out_false = first; + thrust::advance(out_false, num_true); + + return thrust::stable_partition_copy(exec, temp.begin(), temp.end(), first, out_false, pred).first; +} // end stable_partition() + + +template + ForwardIterator stable_partition(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + InputIterator stencil, + Predicate pred) +{ + typedef typename thrust::iterator_traits::value_type InputType; + + // copy input to temp buffer + thrust::detail::temporary_array temp(exec, first, last); + + // count the size of the true partition + InputIterator stencil_last = stencil; + thrust::advance(stencil_last, temp.size()); + typename thrust::iterator_difference::type num_true = thrust::count_if(exec, stencil, stencil_last, pred); + + // point to the beginning of the false partition + ForwardIterator out_false = first; + thrust::advance(out_false, num_true); + + return thrust::stable_partition_copy(exec, temp.begin(), temp.end(), stencil, first, out_false, pred).first; +} // end stable_partition() + + +template + thrust::pair + stable_partition_copy(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator1 out_true, + OutputIterator2 out_false, + Predicate pred) +{ + thrust::detail::unary_negate not_pred(pred); + + // remove_copy_if the true partition to out_true + OutputIterator1 end_of_true_partition = thrust::remove_copy_if(exec, first, last, out_true, not_pred); + + // remove_copy_if the false partition to out_false + OutputIterator2 end_of_false_partition = thrust::remove_copy_if(exec, first, last, out_false, pred); + + return thrust::make_pair(end_of_true_partition, end_of_false_partition); +} // end stable_partition_copy() + + +template + thrust::pair + stable_partition_copy(thrust::execution_policy &exec, + InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator1 out_true, + OutputIterator2 out_false, + Predicate pred) +{ + thrust::detail::unary_negate not_pred(pred); + + // remove_copy_if the true partition to out_true + OutputIterator1 end_of_true_partition = thrust::remove_copy_if(exec, first, last, stencil, out_true, not_pred); + + // remove_copy_if the false partition to out_false + OutputIterator2 end_of_false_partition = thrust::remove_copy_if(exec, first, last, stencil, out_false, pred); + + return thrust::make_pair(end_of_true_partition, end_of_false_partition); +} // end stable_partition_copy() + + +template + ForwardIterator partition(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + Predicate pred) +{ + return thrust::stable_partition(exec, first, last, pred); +} // end partition() + + +template + ForwardIterator partition(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + InputIterator 
stencil, + Predicate pred) +{ + return thrust::stable_partition(exec, first, last, stencil, pred); +} // end partition() + + +template + thrust::pair + partition_copy(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator1 out_true, + OutputIterator2 out_false, + Predicate pred) +{ + return thrust::stable_partition_copy(exec,first,last,out_true,out_false,pred); +} // end partition_copy() + + +template + thrust::pair + partition_copy(thrust::execution_policy &exec, + InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator1 out_true, + OutputIterator2 out_false, + Predicate pred) +{ + return thrust::stable_partition_copy(exec,first,last,stencil,out_true,out_false,pred); +} // end partition_copy() + + +template + ForwardIterator partition_point(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + Predicate pred) +{ + return thrust::find_if_not(exec, first, last, pred); +} // end partition_point() + + +template + bool is_partitioned(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + Predicate pred) +{ + return thrust::is_sorted(exec, + thrust::make_transform_iterator(first, thrust::detail::not1(pred)), + thrust::make_transform_iterator(last, thrust::detail::not1(pred))); +} // end is_partitioned() + + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/generic/reduce.h b/compat/thrust/system/detail/generic/reduce.h new file mode 100644 index 0000000..2811df1 --- /dev/null +++ b/compat/thrust/system/detail/generic/reduce.h @@ -0,0 +1,52 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + +template + typename thrust::iterator_traits::value_type + reduce(thrust::execution_policy &exec, InputIterator first, InputIterator last); + +template + T reduce(thrust::execution_policy &exec, InputIterator first, InputIterator last, T init); + +template + T reduce(thrust::execution_policy &exec, InputIterator first, InputIterator last, T init, BinaryFunction binary_op); + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/detail/generic/reduce.inl b/compat/thrust/system/detail/generic/reduce.inl new file mode 100644 index 0000000..8f52385 --- /dev/null +++ b/compat/thrust/system/detail/generic/reduce.inl @@ -0,0 +1,74 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
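The stable_partition above is assembled from simpler primitives: copy the input to a temporary, count how many elements satisfy the predicate to find where the false half starts, then write the two halves back with two filtered copies. The same decomposition on the host (stable_partition_via_copies is an illustrative name):

#include <algorithm>
#include <cassert>
#include <vector>

template <typename T, typename Pred>
void stable_partition_via_copies(std::vector<T>& v, Pred pred)
{
    std::vector<T> temp(v);                                      // temporary copy of the input
    auto num_true = std::count_if(temp.begin(), temp.end(), pred);
    auto out_true  = v.begin();                                  // true half goes to the front
    auto out_false = v.begin() + num_true;                       // false half starts right after it
    std::copy_if(temp.begin(), temp.end(), out_true,  pred);
    std::copy_if(temp.begin(), temp.end(), out_false, [&](const T& x) { return !pred(x); });
}

int main()
{
    std::vector<int> v{1, 2, 3, 4, 5};
    stable_partition_via_copies(v, [](int x) { return x % 2 == 0; });
    assert((v == std::vector<int>{2, 4, 1, 3, 5}));
    return 0;
}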
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + + +template + typename thrust::iterator_traits::value_type + reduce(thrust::execution_policy &exec, InputIterator first, InputIterator last) +{ + typedef typename thrust::iterator_value::type InputType; + + // use InputType(0) as init by default + return thrust::reduce(exec, first, last, InputType(0)); +} // end reduce() + + +template + T reduce(thrust::execution_policy &exec, InputIterator first, InputIterator last, T init) +{ + // use plus by default + return thrust::reduce(exec, first, last, init, thrust::plus()); +} // end reduce() + + +template + OutputType reduce(thrust::execution_policy &exec, + RandomAccessIterator first, + RandomAccessIterator last, + OutputType init, + BinaryFunction binary_op) +{ + // unimplemented + THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); + return OutputType(); +} // end reduce() + + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/generic/reduce_by_key.h b/compat/thrust/system/detail/generic/reduce_by_key.h new file mode 100644 index 0000000..c6064ab --- /dev/null +++ b/compat/thrust/system/detail/generic/reduce_by_key.h @@ -0,0 +1,86 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + + +template + thrust::pair + reduce_by_key(thrust::execution_policy &exec, + InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + OutputIterator1 keys_output, + OutputIterator2 values_output); + +template + thrust::pair + reduce_by_key(thrust::execution_policy &exec, + InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + OutputIterator1 keys_output, + OutputIterator2 values_output, + BinaryPredicate binary_pred); + +template + thrust::pair + reduce_by_key(thrust::execution_policy &exec, + InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + OutputIterator1 keys_output, + OutputIterator2 values_output, + BinaryPredicate binary_pred, + BinaryFunction binary_op); + + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/detail/generic/reduce_by_key.inl b/compat/thrust/system/detail/generic/reduce_by_key.inl new file mode 100644 index 0000000..2ca21a5 --- /dev/null +++ b/compat/thrust/system/detail/generic/reduce_by_key.inl @@ -0,0 +1,212 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file reduce_by_key.inl + * \brief Inline file for reduce_by_key.h. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ +namespace detail +{ + +template +struct reduce_by_key_functor +{ + AssociativeOperator binary_op; + + typedef typename thrust::tuple result_type; + + __host__ __device__ + reduce_by_key_functor(AssociativeOperator _binary_op) : binary_op(_binary_op) {} + + __host__ __device__ + result_type operator()(result_type a, result_type b) + { + return result_type(thrust::get<1>(b) ? 
thrust::get<0>(b) : binary_op(thrust::get<0>(a), thrust::get<0>(b)), + thrust::get<1>(a) | thrust::get<1>(b)); + } +}; + +} // end namespace detail + + +template + thrust::pair + reduce_by_key(thrust::execution_policy &exec, + InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + OutputIterator1 keys_output, + OutputIterator2 values_output, + BinaryPredicate binary_pred, + BinaryFunction binary_op) +{ + typedef typename thrust::iterator_traits::difference_type difference_type; + typedef typename thrust::iterator_traits::value_type KeyType; + + typedef unsigned int FlagType; // TODO use difference_type + + // the pseudocode for deducing the type of the temporary used below: + // + // if BinaryFunction is AdaptableBinaryFunction + // TemporaryType = AdaptableBinaryFunction::result_type + // else if OutputIterator2 is a "pure" output iterator + // TemporaryType = InputIterator2::value_type + // else + // TemporaryType = OutputIterator2::value_type + // + // XXX upon c++0x, TemporaryType needs to be: + // result_of::type + + typedef typename thrust::detail::eval_if< + thrust::detail::has_result_type::value, + thrust::detail::result_type, + thrust::detail::eval_if< + thrust::detail::is_output_iterator::value, + thrust::iterator_value, + thrust::iterator_value + > + >::type ValueType; + + if (keys_first == keys_last) + return thrust::make_pair(keys_output, values_output); + + // input size + difference_type n = keys_last - keys_first; + + InputIterator2 values_last = values_first + n; + + // compute head flags + thrust::detail::temporary_array head_flags(exec, n); + thrust::transform(exec, keys_first, keys_last - 1, keys_first + 1, head_flags.begin() + 1, thrust::detail::not2(binary_pred)); + head_flags[0] = 1; + + // compute tail flags + thrust::detail::temporary_array tail_flags(exec, n); //COPY INSTEAD OF TRANSFORM + thrust::transform(exec, keys_first, keys_last - 1, keys_first + 1, tail_flags.begin(), thrust::detail::not2(binary_pred)); + tail_flags[n-1] = 1; + + // scan the values by flag + thrust::detail::temporary_array scanned_values(exec, n); + thrust::detail::temporary_array scanned_tail_flags(exec, n); + + thrust::inclusive_scan + (exec, + thrust::make_zip_iterator(thrust::make_tuple(values_first, head_flags.begin())), + thrust::make_zip_iterator(thrust::make_tuple(values_last, head_flags.end())), + thrust::make_zip_iterator(thrust::make_tuple(scanned_values.begin(), scanned_tail_flags.begin())), + detail::reduce_by_key_functor(binary_op)); + + thrust::exclusive_scan(exec, tail_flags.begin(), tail_flags.end(), scanned_tail_flags.begin(), FlagType(0), thrust::plus()); + + // number of unique keys + FlagType N = scanned_tail_flags[n - 1] + 1; + + // scatter the keys and accumulated values + thrust::scatter_if(exec, keys_first, keys_last, scanned_tail_flags.begin(), head_flags.begin(), keys_output); + thrust::scatter_if(exec, scanned_values.begin(), scanned_values.end(), scanned_tail_flags.begin(), tail_flags.begin(), values_output); + + return thrust::make_pair(keys_output + N, values_output + N); +} // end reduce_by_key() + + +template + thrust::pair + reduce_by_key(thrust::execution_policy &exec, + InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + OutputIterator1 keys_output, + OutputIterator2 values_output) +{ + typedef typename thrust::iterator_value::type KeyType; + + // use equal_to as default BinaryPredicate + return thrust::reduce_by_key(exec, keys_first, keys_last, values_first, keys_output, values_output, 
thrust::equal_to()); +} // end reduce_by_key() + + +template + thrust::pair + reduce_by_key(thrust::execution_policy &exec, + InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + OutputIterator1 keys_output, + OutputIterator2 values_output, + BinaryPredicate binary_pred) +{ + typedef typename thrust::detail::eval_if< + thrust::detail::is_output_iterator::value, + thrust::iterator_value, + thrust::iterator_value + >::type T; + + // use plus as default BinaryFunction + return thrust::reduce_by_key(exec, + keys_first, keys_last, + values_first, + keys_output, + values_output, + binary_pred, + thrust::plus()); +} // end reduce_by_key() + + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/generic/remove.h b/compat/thrust/system/detail/generic/remove.h new file mode 100644 index 0000000..e236735 --- /dev/null +++ b/compat/thrust/system/detail/generic/remove.h @@ -0,0 +1,100 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file remove.h + * \brief Generic implementations of remove functions. + */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + +template + ForwardIterator remove(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + const T &value); + +template + OutputIterator remove_copy(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + const T &value); + +template + ForwardIterator remove_if(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + Predicate pred); + +template + ForwardIterator remove_if(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + InputIterator stencil, + Predicate pred); + +template + OutputIterator remove_copy_if(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + Predicate pred); + +template + OutputIterator remove_copy_if(thrust::execution_policy &exec, + InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator result, + Predicate pred); + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/detail/generic/remove.inl b/compat/thrust/system/detail/generic/remove.inl new file mode 100644 index 0000000..8a533e0 --- /dev/null +++ b/compat/thrust/system/detail/generic/remove.inl @@ -0,0 +1,144 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
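reduce_by_key above marks segment boundaries with head and tail flags, runs one inclusive scan whose running sum restarts at every head flag, and then keeps the value sitting under each tail flag. A sequential sketch of that flag-and-segmented-scan structure (reduce_by_key_host is an illustrative name; values are combined with + for brevity):

#include <cassert>
#include <cstddef>
#include <vector>

template <typename K, typename V>
void reduce_by_key_host(const std::vector<K>& keys, const std::vector<V>& vals,
                        std::vector<K>& keys_out, std::vector<V>& vals_out)
{
    const std::size_t n = keys.size();
    if (n == 0) return;

    std::vector<int> head(n), tail(n);
    for (std::size_t i = 0; i < n; ++i)
    {
        head[i] = (i == 0)     || !(keys[i - 1] == keys[i]);   // a segment starts here
        tail[i] = (i == n - 1) || !(keys[i] == keys[i + 1]);   // a segment ends here
    }

    // segmented inclusive scan: the running sum restarts at every head flag
    std::vector<V> scanned(vals);
    for (std::size_t i = 1; i < n; ++i)
        if (!head[i]) scanned[i] = scanned[i - 1] + vals[i];

    // the value under each tail flag is that segment's reduction; compact those out
    for (std::size_t i = 0; i < n; ++i)
        if (tail[i]) { keys_out.push_back(keys[i]); vals_out.push_back(scanned[i]); }
}

int main()
{
    std::vector<int> k{1, 1, 2, 2, 2, 3}, v{1, 2, 3, 4, 5, 6}, ko, vo;
    reduce_by_key_host(k, v, ko, vo);
    assert((ko == std::vector<int>{1, 2, 3} && vo == std::vector<int>{3, 12, 6}));
    return 0;
}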
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file remove.inl + * \brief Inline file for remove.h + */ + +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + + +template + ForwardIterator remove(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + const T &value) +{ + thrust::detail::equal_to_value pred(value); + + // XXX consider using a placeholder here + return thrust::remove_if(exec, first, last, pred); +} // end remove() + + +template + OutputIterator remove_copy(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + const T &value) +{ + thrust::detail::equal_to_value pred(value); + + // XXX consider using a placeholder here + return thrust::remove_copy_if(exec, first, last, result, pred); +} // end remove_copy() + + +template + ForwardIterator remove_if(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + Predicate pred) +{ + typedef typename thrust::iterator_traits::value_type InputType; + + // create temporary storage for an intermediate result + thrust::detail::temporary_array temp(exec, first, last); + + // remove into temp + return thrust::remove_copy_if(exec, temp.begin(), temp.end(), temp.begin(), first, pred); +} // end remove_if() + + +template + ForwardIterator remove_if(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + InputIterator stencil, + Predicate pred) +{ + typedef typename thrust::iterator_traits::value_type InputType; + + // create temporary storage for an intermediate result + thrust::detail::temporary_array temp(exec, first, last); + + // remove into temp + return thrust::remove_copy_if(exec, temp.begin(), temp.end(), stencil, first, pred); +} // end remove_if() + + +template + OutputIterator remove_copy_if(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + Predicate pred) +{ + return thrust::remove_copy_if(exec, first, last, first, result, pred); +} // end remove_copy_if() + + +template + OutputIterator remove_copy_if(thrust::execution_policy &exec, + InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator result, + Predicate pred) +{ + return thrust::copy_if(exec, first, last, stencil, result, thrust::detail::not1(pred)); +} // end remove_copy_if() + + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/generic/replace.h b/compat/thrust/system/detail/generic/replace.h new file mode 100644 index 0000000..deb2e55 --- /dev/null +++ b/compat/thrust/system/detail/generic/replace.h @@ -0,0 +1,92 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
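remove() and remove_if() above are routed through remove_copy_if(), using a temporary_array when the operation is in place. A hedged usage sketch of the public entry points; the predicate and data are invented for illustration.

#include <thrust/host_vector.h>
#include <thrust/remove.h>
#include <cassert>

struct is_negative
{
  __host__ __device__
  bool operator()(int x) const { return x < 0; }
};

int main()
{
  int data[] = {1, -2, 3, -4, 5};
  thrust::host_vector<int> v(data, data + 5);

  // remove_if compacts the kept elements to the front and
  // returns an iterator to the new logical end
  thrust::host_vector<int>::iterator new_end =
      thrust::remove_if(v.begin(), v.end(), is_negative());

  v.erase(new_end, v.end());   // shrink to {1, 3, 5}
  assert(v.size() == 3 && v[0] == 1 && v[1] == 3 && v[2] == 5);
  return 0;
}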
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + + +template + OutputIterator replace_copy_if(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + Predicate pred, + const T &new_value); + + +template + OutputIterator replace_copy_if(thrust::execution_policy &exec, + InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator result, + Predicate pred, + const T &new_value); + + +template + OutputIterator replace_copy(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + const T &old_value, + const T &new_value); + + +template + void replace_if(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + Predicate pred, + const T &new_value); + + +template + void replace_if(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + InputIterator stencil, + Predicate pred, + const T &new_value); + + +template + void replace(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + const T &old_value, + const T &new_value); + + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/detail/generic/replace.inl b/compat/thrust/system/detail/generic/replace.inl new file mode 100644 index 0000000..52e7118 --- /dev/null +++ b/compat/thrust/system/detail/generic/replace.inl @@ -0,0 +1,168 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ +namespace detail +{ + +// this functor receives x, and returns a new_value if predicate(x) is true; otherwise, +// it returns x +template + struct new_value_if +{ + new_value_if(Predicate p, NewType nv):pred(p),new_value(nv){} + + template + __host__ __device__ + OutputType operator()(const InputType x) const + { + return pred(x) ? new_value : x; + } // end operator()() + + // this version of operator()() works like the previous but + // feeds its second argument to pred + template + __host__ __device__ + OutputType operator()(const InputType x, const PredicateArgumentType y) + { + return pred(y) ? 
new_value : x; + } // end operator()() + + Predicate pred; + NewType new_value; +}; // end new_value_if + +// this unary functor ignores its argument and returns a constant +template + struct constant_unary +{ + constant_unary(T _c):c(_c){} + + template + __host__ __device__ + T operator()(U &x) + { + return c; + } // end operator()() + + T c; +}; // end constant_unary + +} // end detail + +template + OutputIterator replace_copy_if(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + Predicate pred, + const T &new_value) +{ + typedef typename thrust::iterator_traits::value_type InputType; + typedef typename thrust::iterator_traits::value_type OutputType; + + detail::new_value_if op(pred,new_value); + return thrust::transform(exec, first, last, result, op); +} // end replace_copy_if() + +template + OutputIterator replace_copy_if(thrust::execution_policy &exec, + InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator result, + Predicate pred, + const T &new_value) +{ + typedef typename thrust::iterator_traits::value_type OutputType; + + detail::new_value_if op(pred,new_value); + return thrust::transform(exec, first, last, stencil, result, op); +} // end replace_copy_if() + + +template + OutputIterator replace_copy(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + const T &old_value, + const T &new_value) +{ + thrust::detail::equal_to_value pred(old_value); + return thrust::replace_copy_if(exec, first, last, result, pred, new_value); +} // end replace_copy() + +template + void replace_if(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + Predicate pred, + const T &new_value) +{ + detail::constant_unary f(new_value); + + // XXX replace this with generate_if: + // constant_nullary f(new_value); + // generate_if(first, last, first, f, pred); + thrust::transform_if(exec, first, last, first, first, f, pred); +} // end replace_if() + +template + void replace_if(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + InputIterator stencil, + Predicate pred, + const T &new_value) +{ + detail::constant_unary f(new_value); + + // XXX replace this with generate_if: + // constant_nullary f(new_value); + // generate_if(stencil, stencil + n, first, f, pred); + thrust::transform_if(exec, first, last, stencil, first, f, pred); +} // end replace_if() + +template + void replace(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + const T &old_value, + const T &new_value) +{ + thrust::detail::equal_to_value pred(old_value); + return thrust::replace_if(exec, first, last, pred, new_value); +} // end replace() + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/generic/reverse.h b/compat/thrust/system/detail/generic/reverse.h new file mode 100644 index 0000000..327bf22 --- /dev/null +++ b/compat/thrust/system/detail/generic/reverse.h @@ -0,0 +1,51 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
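replace_copy_if() maps the new_value_if functor over the range with thrust::transform, while replace_if() reuses transform_if with constant_unary. A small usage sketch of the public entry points; the predicate and values are invented for illustration.

#include <thrust/host_vector.h>
#include <thrust/replace.h>
#include <cassert>

struct is_odd
{
  __host__ __device__
  bool operator()(int x) const { return (x % 2) != 0; }
};

int main()
{
  int data[] = {1, 2, 3, 4, 5};
  thrust::host_vector<int> v(data, data + 5);

  thrust::replace(v.begin(), v.end(), 2, 20);           // 2 -> 20
  thrust::replace_if(v.begin(), v.end(), is_odd(), 0);  // odd -> 0

  assert(v[0] == 0 && v[1] == 20 && v[2] == 0 && v[3] == 4 && v[4] == 0);
  return 0;
}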
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + +template + void reverse(thrust::execution_policy &exec, + BidirectionalIterator first, + BidirectionalIterator last); + +template + OutputIterator reverse_copy(thrust::execution_policy &exec, + BidirectionalIterator first, + BidirectionalIterator last, + OutputIterator result); + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/detail/generic/reverse.inl b/compat/thrust/system/detail/generic/reverse.inl new file mode 100644 index 0000000..27c1bbf --- /dev/null +++ b/compat/thrust/system/detail/generic/reverse.inl @@ -0,0 +1,70 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + +template + void reverse(thrust::execution_policy &exec, + BidirectionalIterator first, + BidirectionalIterator last) +{ + typedef typename thrust::iterator_difference::type difference_type; + + // find the midpoint of [first,last) + difference_type N = thrust::distance(first, last); + BidirectionalIterator mid(first); + thrust::advance(mid, N / 2); + + // swap elements of [first,mid) with [last - 1, mid) + thrust::swap_ranges(exec, first, mid, thrust::make_reverse_iterator(last)); +} // end reverse() + +template + OutputIterator reverse_copy(thrust::execution_policy &exec, + BidirectionalIterator first, + BidirectionalIterator last, + OutputIterator result) +{ + return thrust::copy(exec, + thrust::make_reverse_iterator(last), + thrust::make_reverse_iterator(first), + result); +} // end reverse_copy() + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + + diff --git a/compat/thrust/system/detail/generic/scalar/binary_search.h b/compat/thrust/system/detail/generic/scalar/binary_search.h new file mode 100644 index 0000000..6ed9e8d --- /dev/null +++ b/compat/thrust/system/detail/generic/scalar/binary_search.h @@ -0,0 +1,85 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
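reverse() above swaps the first half of the range against a reverse iterator over the second half, and reverse_copy() is just a copy through make_reverse_iterator. A usage sketch with invented data:

#include <thrust/host_vector.h>
#include <thrust/reverse.h>
#include <cassert>

int main()
{
  int data[] = {1, 2, 3, 4};
  thrust::host_vector<int> v(data, data + 4);
  thrust::host_vector<int> r(4);

  thrust::reverse_copy(v.begin(), v.end(), r.begin()); // r = {4, 3, 2, 1}
  thrust::reverse(v.begin(), v.end());                 // v = {4, 3, 2, 1}

  assert(v[0] == 4 && r[3] == 1);
  return 0;
}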
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace thrust +{ + +namespace system +{ + +namespace detail +{ + +namespace generic +{ + +namespace scalar +{ + +template +__host__ __device__ +RandomAccessIterator lower_bound_n(RandomAccessIterator first, + Size n, + const T &val, + BinaryPredicate comp); + +template +__host__ __device__ +RandomAccessIterator lower_bound(RandomAccessIterator first, RandomAccessIterator last, + const T &val, + BinaryPredicate comp); + +template +__host__ __device__ +RandomAccessIterator upper_bound_n(RandomAccessIterator first, + Size n, + const T &val, + BinaryPredicate comp); + +template +__host__ __device__ +RandomAccessIterator upper_bound(RandomAccessIterator first, RandomAccessIterator last, + const T &val, + BinaryPredicate comp); + +template +__host__ __device__ + pair + equal_range(RandomAccessIterator first, RandomAccessIterator last, + const T &val, + BinaryPredicate comp); + +template +__host__ __device__ +bool binary_search(RandomAccessIterator first, RandomAccessIterator last, const T &value, Compare comp); + +} // end scalar + +} // end generic + +} // end detail + +} // end system + +} // end thrust + +#include + diff --git a/compat/thrust/system/detail/generic/scalar/binary_search.inl b/compat/thrust/system/detail/generic/scalar/binary_search.inl new file mode 100644 index 0000000..5a9d379 --- /dev/null +++ b/compat/thrust/system/detail/generic/scalar/binary_search.inl @@ -0,0 +1,159 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ + +namespace system +{ + +namespace detail +{ + +namespace generic +{ + +namespace scalar +{ + +template +__host__ __device__ +RandomAccessIterator lower_bound_n(RandomAccessIterator first, + Size n, + const T &val, + BinaryPredicate comp) +{ + // wrap comp + thrust::detail::host_device_function< + BinaryPredicate, + bool + > wrapped_comp(comp); + + Size start = 0, i; + while(start < n) + { + i = (start + n) / 2; + if(wrapped_comp(first[i], val)) + { + start = i + 1; + } + else + { + n = i; + } + } // end while + + return first + start; +} + +// XXX generalize these upon implementation of scalar::distance & scalar::advance + +template +__host__ __device__ +RandomAccessIterator lower_bound(RandomAccessIterator first, RandomAccessIterator last, + const T &val, + BinaryPredicate comp) +{ + typename thrust::iterator_difference::type n = last - first; + return lower_bound_n(first, n, val, comp); +} + +template +__host__ __device__ +RandomAccessIterator upper_bound_n(RandomAccessIterator first, + Size n, + const T &val, + BinaryPredicate comp) +{ + // wrap comp + thrust::detail::host_device_function< + BinaryPredicate, + bool + > wrapped_comp(comp); + + Size start = 0, i; + while(start < n) + { + i = (start + n) / 2; + if(wrapped_comp(val, first[i])) + { + n = i; + } + else + { + start = i + 1; + } + } // end while + + return first + start; +} + +template +__host__ __device__ +RandomAccessIterator upper_bound(RandomAccessIterator first, RandomAccessIterator last, + const T &val, + BinaryPredicate comp) +{ + typename thrust::iterator_difference::type n = last - first; + return upper_bound_n(first, n, val, comp); +} + +template +__host__ __device__ + pair + equal_range(RandomAccessIterator first, RandomAccessIterator last, + const T &val, + BinaryPredicate comp) +{ + RandomAccessIterator lb = thrust::system::detail::generic::scalar::lower_bound(first, last, val, comp); + return thrust::make_pair(lb, thrust::system::detail::generic::scalar::upper_bound(lb, last, val, comp)); +} + + +template +__host__ __device__ +bool binary_search(RandomAccessIterator first, RandomAccessIterator last, const T &value, Compare comp) +{ + RandomAccessIterator iter = thrust::system::detail::generic::scalar::lower_bound(first, last, value, comp); + + // wrap comp + thrust::detail::host_device_function< + Compare, + bool + > wrapped_comp(comp); + + return iter != last && !wrapped_comp(value,*iter); +} + +} // end scalar + +} // end generic + +} // end detail + +} // end system + +} // end thrust + +#include + diff --git a/compat/thrust/system/detail/generic/scan.h b/compat/thrust/system/detail/generic/scan.h new file mode 100644 index 0000000..205f87f --- /dev/null +++ b/compat/thrust/system/detail/generic/scan.h @@ -0,0 +1,94 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
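scalar::lower_bound_n() above is a classic bisection: each iteration halves the interval depending on comp(first[i], val), so it needs O(log n) comparisons and is usable from both host and device code. The same logic restated in a standalone form; my_lower_bound and less_int are hypothetical names used only for this sketch.

#include <cstddef>
#include <cassert>

// re-statement of the bisection used by scalar::lower_bound_n:
// returns the first position where val could be inserted while
// keeping the range sorted under comp
template <typename Iterator, typename T, typename Compare>
Iterator my_lower_bound(Iterator first, std::size_t n, const T &val, Compare comp)
{
  std::size_t start = 0;
  while (start < n)
  {
    std::size_t mid = (start + n) / 2;
    if (comp(first[mid], val))
      start = mid + 1;   // everything up to mid compares less than val
    else
      n = mid;           // first[mid] is a candidate; shrink the right bound
  }
  return first + start;
}

struct less_int { bool operator()(int a, int b) const { return a < b; } };

int main()
{
  int sorted[] = {1, 3, 3, 5, 7};
  assert(my_lower_bound(sorted, 5, 3, less_int()) == sorted + 1);
  assert(my_lower_bound(sorted, 5, 4, less_int()) == sorted + 3);
  return 0;
}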
+ */ + + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + + +template + OutputIterator inclusive_scan(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result); + + +// XXX it is an error to call this function; it has no implementation +template + OutputIterator inclusive_scan(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + BinaryFunction binary_op); + + +template + OutputIterator exclusive_scan(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result); + + +template + OutputIterator exclusive_scan(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + T init); + + +// XXX it is an error to call this function; it has no implementation +template + OutputIterator exclusive_scan(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + T init, + BinaryFunction binary_op); + + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/detail/generic/scan.inl b/compat/thrust/system/detail/generic/scan.inl new file mode 100644 index 0000000..33e0803 --- /dev/null +++ b/compat/thrust/system/detail/generic/scan.inl @@ -0,0 +1,144 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
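The scan entry points declared above default the binary operator to thrust::plus and the exclusive-scan init value to zero of the deduced temporary type; the operator-taking overloads are marked as errors to call in the generic backend and must be supplied by a real backend. A usage sketch of the defaulted forms, with invented data:

#include <thrust/host_vector.h>
#include <thrust/scan.h>
#include <cassert>

int main()
{
  int data[] = {1, 2, 3, 4};
  thrust::host_vector<int> v(data, data + 4);
  thrust::host_vector<int> inc(4), exc(4);

  thrust::inclusive_scan(v.begin(), v.end(), inc.begin()); // 1 3 6 10
  thrust::exclusive_scan(v.begin(), v.end(), exc.begin()); // 0 1 3 6

  assert(inc[3] == 10 && exc[0] == 0 && exc[3] == 6);
  return 0;
}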
+ */ + + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + + +template + OutputIterator inclusive_scan(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result) +{ + // the pseudocode for deducing the type of the temporary used below: + // + // if OutputIterator is a "pure" output iterator + // TemporaryType = InputIterator::value_type + // else + // TemporaryType = OutputIterator::value_type + + typedef typename thrust::detail::eval_if< + thrust::detail::is_output_iterator::value, + thrust::iterator_value, + thrust::iterator_value + >::type ValueType; + + // assume plus as the associative operator + return thrust::inclusive_scan(exec, first, last, result, thrust::plus()); +} // end inclusive_scan() + + +template + OutputIterator exclusive_scan(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result) +{ + // the pseudocode for deducing the type of the temporary used below: + // + // if OutputIterator is a "pure" output iterator + // TemporaryType = InputIterator::value_type + // else + // TemporaryType = OutputIterator::value_type + + typedef typename thrust::detail::eval_if< + thrust::detail::is_output_iterator::value, + thrust::iterator_value, + thrust::iterator_value + >::type ValueType; + + // assume 0 as the initialization value + return thrust::exclusive_scan(exec, first, last, result, ValueType(0)); +} // end exclusive_scan() + + +template + OutputIterator exclusive_scan(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + T init) +{ + // assume plus as the associative operator + return thrust::exclusive_scan(exec, first, last, result, init, thrust::plus()); +} // end exclusive_scan() + + +template + OutputIterator inclusive_scan(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + BinaryFunction binary_op) +{ + // unimplemented primitive + THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); + return result; +} // end inclusive_scan + + +template + OutputIterator exclusive_scan(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + T init, + BinaryFunction binary_op) +{ + // unimplemented primitive + THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); + return result; +} // end exclusive_scan() + + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/generic/scan_by_key.h b/compat/thrust/system/detail/generic/scan_by_key.h new file mode 100644 index 0000000..160121b --- /dev/null +++ b/compat/thrust/system/detail/generic/scan_by_key.h @@ -0,0 +1,137 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! 
\file scan_by_key.h + * \brief Generic implementations of key-value scans. + */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + + +template + OutputIterator inclusive_scan_by_key(thrust::execution_policy &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + OutputIterator result); + + +template + OutputIterator inclusive_scan_by_key(thrust::execution_policy &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + OutputIterator result, + BinaryPredicate binary_pred); + + +template + OutputIterator inclusive_scan_by_key(thrust::execution_policy &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + OutputIterator result, + BinaryPredicate binary_pred, + AssociativeOperator binary_op); + + +template + OutputIterator exclusive_scan_by_key(thrust::execution_policy &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + OutputIterator result); + + +template + OutputIterator exclusive_scan_by_key(thrust::execution_policy &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + OutputIterator result, + T init); + + +template + OutputIterator exclusive_scan_by_key(thrust::execution_policy &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + OutputIterator result, + T init, + BinaryPredicate binary_pred); + + +template + OutputIterator exclusive_scan_by_key(thrust::execution_policy &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + OutputIterator result, + T init, + BinaryPredicate binary_pred, + AssociativeOperator binary_op); + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/detail/generic/scan_by_key.inl b/compat/thrust/system/detail/generic/scan_by_key.inl new file mode 100644 index 0000000..d866dde --- /dev/null +++ b/compat/thrust/system/detail/generic/scan_by_key.inl @@ -0,0 +1,239 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ +namespace detail +{ + +template +struct segmented_scan_functor +{ + AssociativeOperator binary_op; + + typedef typename thrust::tuple result_type; + + __host__ __device__ + segmented_scan_functor(AssociativeOperator _binary_op) : binary_op(_binary_op) {} + + __host__ __device__ + result_type operator()(result_type a, result_type b) + { + return result_type(thrust::get<1>(b) ? 
thrust::get<0>(b) : binary_op(thrust::get<0>(a), thrust::get<0>(b)), + thrust::get<1>(a) | thrust::get<1>(b)); + } +}; + +} // end namespace detail + + +template + OutputIterator inclusive_scan_by_key(thrust::execution_policy &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + OutputIterator result) +{ + typedef typename thrust::iterator_traits::value_type InputType1; + return thrust::inclusive_scan_by_key(exec, first1, last1, first2, result, thrust::equal_to()); +} + + +template + OutputIterator inclusive_scan_by_key(thrust::execution_policy &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + OutputIterator result, + BinaryPredicate binary_pred) +{ + typedef typename thrust::iterator_traits::value_type OutputType; + return thrust::inclusive_scan_by_key(exec, first1, last1, first2, result, binary_pred, thrust::plus()); +} + + +template + OutputIterator inclusive_scan_by_key(thrust::execution_policy &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + OutputIterator result, + BinaryPredicate binary_pred, + AssociativeOperator binary_op) +{ + typedef typename thrust::iterator_traits::value_type OutputType; + typedef unsigned int HeadFlagType; + + const size_t n = last1 - first1; + + if(n != 0) + { + // compute head flags + thrust::detail::temporary_array flags(exec, n); + flags[0] = 1; thrust::transform(exec, first1, last1 - 1, first1 + 1, flags.begin() + 1, thrust::detail::not2(binary_pred)); + + // scan key-flag tuples, + // For additional details refer to Section 2 of the following paper + // S. Sengupta, M. Harris, and M. Garland. "Efficient parallel scan algorithms for GPUs" + // NVIDIA Technical Report NVR-2008-003, December 2008 + // http://mgarland.org/files/papers/nvr-2008-003.pdf + thrust::inclusive_scan + (exec, + thrust::make_zip_iterator(thrust::make_tuple(first2, flags.begin())), + thrust::make_zip_iterator(thrust::make_tuple(first2, flags.begin())) + n, + thrust::make_zip_iterator(thrust::make_tuple(result, flags.begin())), + detail::segmented_scan_functor(binary_op)); + } + + return result + n; +} + + +template + OutputIterator exclusive_scan_by_key(thrust::execution_policy &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + OutputIterator result) +{ + typedef typename thrust::iterator_traits::value_type OutputType; + return thrust::exclusive_scan_by_key(exec, first1, last1, first2, result, OutputType(0)); +} + + +template + OutputIterator exclusive_scan_by_key(thrust::execution_policy &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + OutputIterator result, + T init) +{ + typedef typename thrust::iterator_traits::value_type InputType1; + return thrust::exclusive_scan_by_key(exec, first1, last1, first2, result, init, thrust::equal_to()); +} + + +template + OutputIterator exclusive_scan_by_key(thrust::execution_policy &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + OutputIterator result, + T init, + BinaryPredicate binary_pred) +{ + typedef typename thrust::iterator_traits::value_type OutputType; + return thrust::exclusive_scan_by_key(exec, first1, last1, first2, result, init, binary_pred, thrust::plus()); +} + + +template + OutputIterator exclusive_scan_by_key(thrust::execution_policy &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + OutputIterator result, + T init, + BinaryPredicate binary_pred, + AssociativeOperator binary_op) +{ + typedef typename 
thrust::iterator_traits::value_type OutputType; + typedef unsigned int HeadFlagType; + + const size_t n = last1 - first1; + + if(n != 0) + { + InputIterator2 last2 = first2 + n; + + // compute head flags + thrust::detail::temporary_array flags(exec, n); + flags[0] = 1; thrust::transform(exec, first1, last1 - 1, first1 + 1, flags.begin() + 1, thrust::detail::not2(binary_pred)); + + // shift input one to the right and initialize segments with init + thrust::detail::temporary_array temp(exec, n); + thrust::replace_copy_if(exec, first2, last2 - 1, flags.begin() + 1, temp.begin() + 1, thrust::negate(), init); + temp[0] = init; + + // scan key-flag tuples, + // For additional details refer to Section 2 of the following paper + // S. Sengupta, M. Harris, and M. Garland. "Efficient parallel scan algorithms for GPUs" + // NVIDIA Technical Report NVR-2008-003, December 2008 + // http://mgarland.org/files/papers/nvr-2008-003.pdf + thrust::inclusive_scan(exec, + thrust::make_zip_iterator(thrust::make_tuple(temp.begin(), flags.begin())), + thrust::make_zip_iterator(thrust::make_tuple(temp.begin(), flags.begin())) + n, + thrust::make_zip_iterator(thrust::make_tuple(result, flags.begin())), + detail::segmented_scan_functor(binary_op)); + } + + return result + n; +} + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/generic/scatter.h b/compat/thrust/system/detail/generic/scatter.h new file mode 100644 index 0000000..858d11a --- /dev/null +++ b/compat/thrust/system/detail/generic/scatter.h @@ -0,0 +1,76 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + +template + void scatter(thrust::execution_policy &exec, + InputIterator1 first, + InputIterator1 last, + InputIterator2 map, + RandomAccessIterator output); + + +template + void scatter_if(thrust::execution_policy &exec, + InputIterator1 first, + InputIterator1 last, + InputIterator2 map, + InputIterator3 stencil, + RandomAccessIterator output); + + +template + void scatter_if(thrust::execution_policy &exec, + InputIterator1 first, + InputIterator1 last, + InputIterator2 map, + InputIterator3 stencil, + RandomAccessIterator output, + Predicate pred); + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/detail/generic/scatter.inl b/compat/thrust/system/detail/generic/scatter.inl new file mode 100644 index 0000000..8c40359 --- /dev/null +++ b/compat/thrust/system/detail/generic/scatter.inl @@ -0,0 +1,93 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
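The segmented scans above convert the key sequence into head flags and then perform a single flag-carrying inclusive scan, following the Sengupta/Harris/Garland formulation cited in the comments. A usage sketch of the public entry points; keys and values are invented for illustration.

#include <thrust/host_vector.h>
#include <thrust/scan.h>
#include <cassert>

int main()
{
  int k[] = {1, 1, 2, 2, 2};
  int v[] = {1, 2, 3, 4, 5};
  thrust::host_vector<int> keys(k, k + 5), vals(v, v + 5);
  thrust::host_vector<int> inc(5), exc(5);

  // the scans restart whenever the key changes
  thrust::inclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), inc.begin());
  // inc = {1, 3, 3, 7, 12}
  thrust::exclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), exc.begin());
  // exc = {0, 1, 0, 3, 7}

  assert(inc[1] == 3 && inc[2] == 3 && inc[4] == 12);
  assert(exc[0] == 0 && exc[2] == 0 && exc[4] == 7);
  return 0;
}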
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + + +template + void scatter(thrust::execution_policy &exec, + InputIterator1 first, + InputIterator1 last, + InputIterator2 map, + RandomAccessIterator output) +{ + thrust::transform(exec, + first, + last, + thrust::make_permutation_iterator(output, map), + thrust::identity::type>()); +} // end scatter() + + +template + void scatter_if(thrust::execution_policy &exec, + InputIterator1 first, + InputIterator1 last, + InputIterator2 map, + InputIterator3 stencil, + RandomAccessIterator output) +{ + // default predicate is identity + typedef typename thrust::iterator_value::type StencilType; + thrust::scatter_if(exec, first, last, map, stencil, output, thrust::identity()); +} // end scatter_if() + + +template + void scatter_if(thrust::execution_policy &exec, + InputIterator1 first, + InputIterator1 last, + InputIterator2 map, + InputIterator3 stencil, + RandomAccessIterator output, + Predicate pred) +{ + typedef typename thrust::iterator_value::type InputType; + thrust::transform_if(exec, first, last, stencil, thrust::make_permutation_iterator(output, map), thrust::identity(), pred); +} // end scatter_if() + + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/generic/select_system.h b/compat/thrust/system/detail/generic/select_system.h new file mode 100644 index 0000000..250a0bc --- /dev/null +++ b/compat/thrust/system/detail/generic/select_system.h @@ -0,0 +1,182 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
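scatter() above is a transform through a permutation_iterator over the output, and scatter_if() gates each write on the stencil/predicate via transform_if. A usage sketch with an invented map:

#include <thrust/host_vector.h>
#include <thrust/scatter.h>
#include <cassert>

int main()
{
  int v[] = {10, 20, 30, 40};
  int m[] = { 3,  0,  2,  1};   // output[map[i]] = input[i]
  thrust::host_vector<int> vals(v, v + 4), map(m, m + 4), out(4, 0);

  thrust::scatter(vals.begin(), vals.end(), map.begin(), out.begin());
  // out = {20, 40, 30, 10}

  assert(out[0] == 20 && out[3] == 10);
  return 0;
}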
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ +namespace select_system_detail +{ + + +// min_system case 1: both systems have the same type, just return the first one +template +__host__ __device__ +System &min_system(thrust::execution_policy &system1, + thrust::execution_policy &) +{ + return thrust::detail::derived_cast(system1); +} // end min_system() + + +// min_system case 2: systems have differing type and the first type is considered the minimum +template +__host__ __device__ + typename thrust::detail::enable_if< + thrust::detail::is_same< + System1, + typename thrust::detail::minimum_system::type + >::value, + System1 & + >::type + min_system(thrust::execution_policy &system1, thrust::execution_policy &) +{ + return thrust::detail::derived_cast(system1); +} // end min_system() + + +// min_system case 3: systems have differing type and the second type is considered the minimum +template +__host__ __device__ + typename thrust::detail::enable_if< + thrust::detail::is_same< + System2, + typename thrust::detail::minimum_system::type + >::value, + System2 & + >::type + min_system(thrust::execution_policy &, thrust::execution_policy &system2) +{ + return thrust::detail::derived_cast(system2); +} // end min_system() + + +} // end select_system_detail + + +template +__host__ __device__ + typename thrust::detail::disable_if< + select_system1_exists::value, + System & + >::type + select_system(thrust::execution_policy &system) +{ + return thrust::detail::derived_cast(system); +} // end select_system() + + +template +__host__ __device__ + typename thrust::detail::enable_if_defined< + thrust::detail::minimum_system + >::type + &select_system(thrust::execution_policy &system1, + thrust::execution_policy &system2) +{ + return select_system_detail::min_system(system1,system2); +} // end select_system() + + +template +__host__ __device__ + typename thrust::detail::lazy_disable_if< + select_system3_exists::value, + thrust::detail::minimum_system + >::type + &select_system(thrust::execution_policy &system1, + thrust::execution_policy &system2, + thrust::execution_policy &system3) +{ + return select_system(select_system(system1,system2), system3); +} // end select_system() + + +template +__host__ __device__ + typename thrust::detail::lazy_disable_if< + select_system4_exists::value, + thrust::detail::minimum_system + >::type + &select_system(thrust::execution_policy &system1, + thrust::execution_policy &system2, + thrust::execution_policy &system3, + thrust::execution_policy &system4) +{ + return select_system(select_system(system1,system2,system3), system4); +} // end select_system() + + +template +__host__ __device__ + typename thrust::detail::lazy_disable_if< + select_system5_exists::value, + thrust::detail::minimum_system + >::type + &select_system(thrust::execution_policy &system1, + thrust::execution_policy &system2, + thrust::execution_policy &system3, + thrust::execution_policy &system4, + thrust::execution_policy &system5) +{ + return select_system(select_system(system1,system2,system3,system4), system5); +} // end select_system() + + +template +__host__ __device__ + typename thrust::detail::lazy_disable_if< + select_system6_exists::value, + thrust::detail::minimum_system + >::type + &select_system(thrust::execution_policy &system1, + thrust::execution_policy &system2, + thrust::execution_policy &system3, + thrust::execution_policy &system4, + 
thrust::execution_policy &system5, + thrust::execution_policy &system6) +{ + return select_system(select_system(system1,system2,system3,system4,system5), system6); +} // end select_system() + + +// map a single any_system_tag to device_system_tag +inline __host__ __device__ +thrust::device_system_tag select_system(thrust::any_system_tag) +{ + return thrust::device_system_tag(); +} // end select_system() + + +} // end generic +} // end detail +} // end system +} // end thrust + diff --git a/compat/thrust/system/detail/generic/sequence.h b/compat/thrust/system/detail/generic/sequence.h new file mode 100644 index 0000000..b23a7b5 --- /dev/null +++ b/compat/thrust/system/detail/generic/sequence.h @@ -0,0 +1,61 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + + +template + void sequence(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last); + + +template + void sequence(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + T init); + + +template + void sequence(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + T init, + T step); + + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/detail/generic/sequence.inl b/compat/thrust/system/detail/generic/sequence.inl new file mode 100644 index 0000000..45aec69 --- /dev/null +++ b/compat/thrust/system/detail/generic/sequence.inl @@ -0,0 +1,69 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + + +template + void sequence(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last) +{ + typedef typename thrust::iterator_traits::value_type T; + + thrust::sequence(exec, first, last, T(0)); +} // end sequence() + + +template + void sequence(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + T init) +{ + thrust::sequence(exec, first, last, init, T(1)); +} // end sequence() + + +template + void sequence(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + T init, + T step) +{ + thrust::tabulate(exec, first, last, init + step * thrust::placeholders::_1); +} // end sequence() + + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/generic/set_operations.h b/compat/thrust/system/detail/generic/set_operations.h new file mode 100644 index 0000000..1ca8d39 --- /dev/null +++ b/compat/thrust/system/detail/generic/set_operations.h @@ -0,0 +1,303 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
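sequence() above forwards to tabulate() with the placeholder expression init + step * _1, so element i receives init + step * i, with init defaulting to 0 and step to 1. A usage sketch:

#include <thrust/host_vector.h>
#include <thrust/sequence.h>
#include <cassert>

int main()
{
  thrust::host_vector<int> v(5);

  thrust::sequence(v.begin(), v.end());        // 0 1 2 3 4
  assert(v[4] == 4);

  thrust::sequence(v.begin(), v.end(), 10, 5); // 10 15 20 25 30
  assert(v[0] == 10 && v[4] == 30);
  return 0;
}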
+ */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + + +template + OutputIterator set_difference(thrust::execution_policy &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result); + + +// XXX it is an error to call this function; it has no implementation +template + OutputIterator set_difference(thrust::execution_policy &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + StrictWeakOrdering comp); + + +template + thrust::pair + set_difference_by_key(thrust::execution_policy &exec, + InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 keys_last2, + InputIterator3 values_first1, + InputIterator4 values_first2, + OutputIterator1 keys_result, + OutputIterator2 values_result); + + +template + thrust::pair + set_difference_by_key(thrust::execution_policy &exec, + InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 keys_last2, + InputIterator3 values_first1, + InputIterator4 values_first2, + OutputIterator1 keys_result, + OutputIterator2 values_result, + StrictWeakOrdering comp); + + +template + OutputIterator set_intersection(thrust::execution_policy &system, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result); + + +// XXX it is an error to call this function; it has no implementation +template + OutputIterator set_intersection(thrust::execution_policy &system, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + StrictWeakOrdering comp); + + +template + thrust::pair + set_intersection_by_key(thrust::execution_policy &system, + InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 keys_last2, + InputIterator3 values_first1, + OutputIterator1 keys_result, + OutputIterator2 values_result); + + +template + thrust::pair + set_intersection_by_key(thrust::execution_policy &system, + InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 keys_last2, + InputIterator3 values_first1, + OutputIterator1 keys_result, + OutputIterator2 values_result, + StrictWeakOrdering comp); + + +template + OutputIterator set_symmetric_difference(thrust::execution_policy &system, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result); + + +// XXX it is an error to call this function; it has no implementation +template + OutputIterator set_symmetric_difference(thrust::execution_policy &system, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + StrictWeakOrdering comp); + + +template + thrust::pair + set_symmetric_difference_by_key(thrust::execution_policy &system, + InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 keys_last2, + InputIterator3 values_first1, + InputIterator4 values_first2, + OutputIterator1 keys_result, + OutputIterator2 values_result); + + +template + thrust::pair + set_symmetric_difference_by_key(thrust::execution_policy &system, + InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 
keys_last2, + InputIterator3 values_first1, + InputIterator4 values_first2, + OutputIterator1 keys_result, + OutputIterator2 values_result, + StrictWeakOrdering comp); + + +template + OutputIterator set_union(thrust::execution_policy &system, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result); + + +// XXX it is an error to call this function; it has no implementation +template + OutputIterator set_union(thrust::execution_policy &system, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + StrictWeakOrdering comp); + + +template + thrust::pair + set_union_by_key(thrust::execution_policy &system, + InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 keys_last2, + InputIterator3 values_first1, + InputIterator4 values_first2, + OutputIterator1 keys_result, + OutputIterator2 values_result); + + +template + thrust::pair + set_union_by_key(thrust::execution_policy &system, + InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 keys_last2, + InputIterator3 values_first1, + InputIterator4 values_first2, + OutputIterator1 keys_result, + OutputIterator2 values_result, + StrictWeakOrdering comp); + + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/detail/generic/set_operations.inl b/compat/thrust/system/detail/generic/set_operations.inl new file mode 100644 index 0000000..bac9ccd --- /dev/null +++ b/compat/thrust/system/detail/generic/set_operations.inl @@ -0,0 +1,449 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
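The set algorithms declared above require both input ranges to be sorted and default the ordering to thrust::less; the comparator-taking overloads are documented as errors to call in the generic backend and are provided by concrete backends. A usage sketch of set_union and set_difference with invented data:

#include <thrust/host_vector.h>
#include <thrust/set_operations.h>
#include <cassert>

int main()
{
  int a[] = {1, 3, 5, 7};
  int b[] = {1, 4, 5, 8};
  thrust::host_vector<int> A(a, a + 4), B(b, b + 4);
  thrust::host_vector<int> u(8), d(4);

  // both inputs must already be sorted (here under the default operator<)
  thrust::host_vector<int>::iterator u_end =
      thrust::set_union(A.begin(), A.end(), B.begin(), B.end(), u.begin());
  thrust::host_vector<int>::iterator d_end =
      thrust::set_difference(A.begin(), A.end(), B.begin(), B.end(), d.begin());

  assert(u_end - u.begin() == 6);  // {1, 3, 4, 5, 7, 8}
  assert(d_end - d.begin() == 2);  // {3, 7}
  return 0;
}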
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + + +template + OutputIterator set_difference(thrust::execution_policy &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result) +{ + typedef typename thrust::iterator_value::type value_type; + return thrust::set_difference(exec, first1, last1, first2, last2, result, thrust::less()); +} // end set_difference() + + +template + thrust::pair + set_difference_by_key(thrust::execution_policy &exec, + InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 keys_last2, + InputIterator3 values_first1, + InputIterator4 values_first2, + OutputIterator1 keys_result, + OutputIterator2 values_result) +{ + typedef typename thrust::iterator_value::type value_type; + return thrust::set_difference_by_key(exec, keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result, thrust::less()); +} // end set_difference_by_key() + + +template + thrust::pair + set_difference_by_key(thrust::execution_policy &exec, + InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 keys_last2, + InputIterator3 values_first1, + InputIterator4 values_first2, + OutputIterator1 keys_result, + OutputIterator2 values_result, + StrictWeakOrdering comp) +{ + typedef thrust::tuple iterator_tuple1; + typedef thrust::tuple iterator_tuple2; + typedef thrust::tuple iterator_tuple3; + + typedef thrust::zip_iterator zip_iterator1; + typedef thrust::zip_iterator zip_iterator2; + typedef thrust::zip_iterator zip_iterator3; + + zip_iterator1 zipped_first1 = thrust::make_zip_iterator(thrust::make_tuple(keys_first1, values_first1)); + zip_iterator1 zipped_last1 = thrust::make_zip_iterator(thrust::make_tuple(keys_last1, values_first1)); + + zip_iterator2 zipped_first2 = thrust::make_zip_iterator(thrust::make_tuple(keys_first2, values_first2)); + zip_iterator2 zipped_last2 = thrust::make_zip_iterator(thrust::make_tuple(keys_last2, values_first2)); + + zip_iterator3 zipped_result = thrust::make_zip_iterator(thrust::make_tuple(keys_result, values_result)); + + thrust::detail::compare_first comp_first(comp); + + iterator_tuple3 result = thrust::set_difference(exec, zipped_first1, zipped_last1, zipped_first2, zipped_last2, zipped_result, comp_first).get_iterator_tuple(); + + return thrust::make_pair(thrust::get<0>(result), thrust::get<1>(result)); +} // end set_difference_by_key() + + +template + OutputIterator set_intersection(thrust::execution_policy &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result) +{ + typedef typename thrust::iterator_value::type value_type; + return thrust::set_intersection(exec, first1, last1, first2, last2, result, thrust::less()); +} // end set_intersection() + + +template + thrust::pair + set_intersection_by_key(thrust::execution_policy &exec, + InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 keys_last2, + InputIterator3 values_first1, + OutputIterator1 keys_result, + OutputIterator2 values_result) +{ + typedef typename thrust::iterator_value::type value_type; + return thrust::set_intersection_by_key(exec, keys_first1, keys_last1, keys_first2, keys_last2, values_first1, keys_result, values_result, 
thrust::less()); +} // end set_intersection_by_key() + + +template + thrust::pair + set_intersection_by_key(thrust::execution_policy &exec, + InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 keys_last2, + InputIterator3 values_first1, + OutputIterator1 keys_result, + OutputIterator2 values_result, + StrictWeakOrdering comp) +{ + typedef thrust::tuple iterator_tuple1; + typedef thrust::tuple iterator_tuple2; + typedef thrust::tuple iterator_tuple3; + + typedef thrust::zip_iterator zip_iterator1; + typedef thrust::zip_iterator zip_iterator2; + typedef thrust::zip_iterator zip_iterator3; + + // fabricate a values_first2 by "sending" keys twice + // it should never be dereferenced by set_intersection + InputIterator2 values_first2 = keys_first2; + + zip_iterator1 zipped_first1 = thrust::make_zip_iterator(thrust::make_tuple(keys_first1, values_first1)); + zip_iterator1 zipped_last1 = thrust::make_zip_iterator(thrust::make_tuple(keys_last1, values_first1)); + + zip_iterator2 zipped_first2 = thrust::make_zip_iterator(thrust::make_tuple(keys_first2, values_first2)); + zip_iterator2 zipped_last2 = thrust::make_zip_iterator(thrust::make_tuple(keys_last2, values_first2)); + + zip_iterator3 zipped_result = thrust::make_zip_iterator(thrust::make_tuple(keys_result, values_result)); + + thrust::detail::compare_first comp_first(comp); + + iterator_tuple3 result = thrust::set_intersection(exec, zipped_first1, zipped_last1, zipped_first2, zipped_last2, zipped_result, comp_first).get_iterator_tuple(); + + return thrust::make_pair(thrust::get<0>(result), thrust::get<1>(result)); +} // end set_intersection_by_key() + + +template + OutputIterator set_symmetric_difference(thrust::execution_policy &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result) +{ + typedef typename thrust::iterator_value::type value_type; + return thrust::set_symmetric_difference(exec, first1, last1, first2, last2, result, thrust::less()); +} // end set_symmetric_difference() + + +template + thrust::pair + set_symmetric_difference_by_key(thrust::execution_policy &exec, + InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 keys_last2, + InputIterator3 values_first1, + InputIterator4 values_first2, + OutputIterator1 keys_result, + OutputIterator2 values_result) +{ + typedef typename thrust::iterator_value::type value_type; + return thrust::set_symmetric_difference_by_key(exec, keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result, thrust::less()); +} // end set_symmetric_difference_by_key() + + +template + thrust::pair + set_symmetric_difference_by_key(thrust::execution_policy &exec, + InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 keys_last2, + InputIterator3 values_first1, + InputIterator4 values_first2, + OutputIterator1 keys_result, + OutputIterator2 values_result, + StrictWeakOrdering comp) +{ + typedef thrust::tuple iterator_tuple1; + typedef thrust::tuple iterator_tuple2; + typedef thrust::tuple iterator_tuple3; + + typedef thrust::zip_iterator zip_iterator1; + typedef thrust::zip_iterator zip_iterator2; + typedef thrust::zip_iterator zip_iterator3; + + zip_iterator1 zipped_first1 = thrust::make_zip_iterator(thrust::make_tuple(keys_first1, values_first1)); + zip_iterator1 zipped_last1 = thrust::make_zip_iterator(thrust::make_tuple(keys_last1, 
values_first1)); + + zip_iterator2 zipped_first2 = thrust::make_zip_iterator(thrust::make_tuple(keys_first2, values_first2)); + zip_iterator2 zipped_last2 = thrust::make_zip_iterator(thrust::make_tuple(keys_last2, values_first2)); + + zip_iterator3 zipped_result = thrust::make_zip_iterator(thrust::make_tuple(keys_result, values_result)); + + thrust::detail::compare_first comp_first(comp); + + iterator_tuple3 result = thrust::set_symmetric_difference(exec, zipped_first1, zipped_last1, zipped_first2, zipped_last2, zipped_result, comp_first).get_iterator_tuple(); + + return thrust::make_pair(thrust::get<0>(result), thrust::get<1>(result)); +} // end set_symmetric_difference_by_key() + + +template + OutputIterator set_union(thrust::execution_policy &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result) +{ + typedef typename thrust::iterator_value::type value_type; + return thrust::set_union(exec, first1, last1, first2, last2, result, thrust::less()); +} // end set_union() + + +template + thrust::pair + set_union_by_key(thrust::execution_policy &exec, + InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 keys_last2, + InputIterator3 values_first1, + InputIterator4 values_first2, + OutputIterator1 keys_result, + OutputIterator2 values_result) +{ + typedef typename thrust::iterator_value::type value_type; + return thrust::set_union_by_key(exec, keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result, thrust::less()); +} // end set_union_by_key() + + +template + thrust::pair + set_union_by_key(thrust::execution_policy &exec, + InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 keys_last2, + InputIterator3 values_first1, + InputIterator4 values_first2, + OutputIterator1 keys_result, + OutputIterator2 values_result, + StrictWeakOrdering comp) +{ + typedef thrust::tuple iterator_tuple1; + typedef thrust::tuple iterator_tuple2; + typedef thrust::tuple iterator_tuple3; + + typedef thrust::zip_iterator zip_iterator1; + typedef thrust::zip_iterator zip_iterator2; + typedef thrust::zip_iterator zip_iterator3; + + zip_iterator1 zipped_first1 = thrust::make_zip_iterator(thrust::make_tuple(keys_first1, values_first1)); + zip_iterator1 zipped_last1 = thrust::make_zip_iterator(thrust::make_tuple(keys_last1, values_first1)); + + zip_iterator2 zipped_first2 = thrust::make_zip_iterator(thrust::make_tuple(keys_first2, values_first2)); + zip_iterator2 zipped_last2 = thrust::make_zip_iterator(thrust::make_tuple(keys_last2, values_first2)); + + zip_iterator3 zipped_result = thrust::make_zip_iterator(thrust::make_tuple(keys_result, values_result)); + + thrust::detail::compare_first comp_first(comp); + + iterator_tuple3 result = thrust::set_union(exec, zipped_first1, zipped_last1, zipped_first2, zipped_last2, zipped_result, comp_first).get_iterator_tuple(); + + return thrust::make_pair(thrust::get<0>(result), thrust::get<1>(result)); +} // end set_union_by_key() + + +template + OutputIterator set_difference(thrust::execution_policy &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + StrictWeakOrdering comp) +{ + // unimplemented primitive + THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); + return result; +} // end set_difference() + + +template + OutputIterator 
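// [Illustrative sketch, editorial addition -- not part of this patch; names of the
// example function and variables are hypothetical.]
// All of the *_by_key overloads above reduce the keyed problem to the plain set
// operation: keys and values are fused with zip_iterator and compared on the key
// component only (compare_first).  A caller-level view of what that achieves:
#include <thrust/set_operations.h>
#include <thrust/host_vector.h>
#include <thrust/pair.h>

void set_union_by_key_example()
{
  thrust::host_vector<int>  ka(3), kb(2);
  thrust::host_vector<char> va(3), vb(2);
  ka[0] = 0; ka[1] = 2; ka[2] = 4;  va[0] = 'a'; va[1] = 'b'; va[2] = 'c';
  kb[0] = 1; kb[1] = 2;             vb[0] = 'x'; vb[1] = 'y';

  thrust::host_vector<int>  keys_out(5);
  thrust::host_vector<char> vals_out(5);

  // keys_out becomes {0,1,2,4}; each value follows its key, first range wins ties
  thrust::pair<thrust::host_vector<int>::iterator,
               thrust::host_vector<char>::iterator> ends =
    thrust::set_union_by_key(ka.begin(), ka.end(),
                             kb.begin(), kb.end(),
                             va.begin(), vb.begin(),
                             keys_out.begin(), vals_out.begin());
  (void)ends;
}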
set_intersection(thrust::execution_policy &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + StrictWeakOrdering comp) +{ + // unimplemented primitive + THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); + return result; +} // end set_intersection() + + +template + OutputIterator set_symmetric_difference(thrust::execution_policy &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + StrictWeakOrdering comp) +{ + // unimplemented primitive + THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); + return result; +} // end set_symmetric_difference() + + +template + OutputIterator set_union(thrust::execution_policy &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + StrictWeakOrdering comp) +{ + // unimplemented primitive + THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); + return result; +} // end set_union() + + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/generic/sort.h b/compat/thrust/system/detail/generic/sort.h new file mode 100644 index 0000000..5498708 --- /dev/null +++ b/compat/thrust/system/detail/generic/sort.h @@ -0,0 +1,142 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
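// [Illustrative sketch, editorial addition -- not part of this patch.]
// The fall-through overloads above deliberately refuse to compile when no backend
// supplies a real implementation.  The condition handed to the static assertion is
// made to depend on a template parameter, so it is only evaluated -- and only
// fires -- when the overload is actually instantiated.  A minimal rendering of the
// idiom (simplified, with the assertion shown as a comment):
template<typename T, bool Value>
struct depend_on_instantiation_sketch
{
  static const bool value = Value;   // still "false", but now type-dependent
};

// THRUST_STATIC_ASSERT(
//   (depend_on_instantiation_sketch<InputIterator1, false>::value) );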
+ */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + + +template + void sort(thrust::execution_policy &exec, + RandomAccessIterator first, + RandomAccessIterator last); + + +template + void sort(thrust::execution_policy &exec, + RandomAccessIterator first, + RandomAccessIterator last, + StrictWeakOrdering comp); + + +template + void sort_by_key(thrust::execution_policy &exec, + RandomAccessIterator1 keys_first, + RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first); + + +template + void sort_by_key(thrust::execution_policy &exec, + RandomAccessIterator1 keys_first, + RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first, + StrictWeakOrdering comp); + + +template + void stable_sort(thrust::execution_policy &exec, + RandomAccessIterator first, + RandomAccessIterator last); + + +// XXX it is an error to call this function; it has no implementation +template + void stable_sort(thrust::execution_policy &exec, + RandomAccessIterator first, + RandomAccessIterator last, + StrictWeakOrdering comp); + + +template + void stable_sort_by_key(thrust::execution_policy &exec, + RandomAccessIterator1 keys_first, + RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first); + + +// XXX it is an error to call this function; it has no implementation +template + void stable_sort_by_key(thrust::execution_policy &exec, + RandomAccessIterator1 keys_first, + RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first, + StrictWeakOrdering comp); + + +template + bool is_sorted(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last); + + +template + bool is_sorted(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + Compare comp); + + +template + ForwardIterator is_sorted_until(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last); + + +template + ForwardIterator is_sorted_until(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + Compare comp); + + +} // end generic +} // end detail +} // end system +} // end thrust + +#include + diff --git a/compat/thrust/system/detail/generic/sort.inl b/compat/thrust/system/detail/generic/sort.inl new file mode 100644 index 0000000..aabb2ee --- /dev/null +++ b/compat/thrust/system/detail/generic/sort.inl @@ -0,0 +1,202 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + + +template + void sort(thrust::execution_policy &exec, + RandomAccessIterator first, + RandomAccessIterator last) +{ + typedef typename thrust::iterator_value::type value_type; + thrust::sort(exec, first, last, thrust::less()); +} // end sort() + + +template + void sort(thrust::execution_policy &exec, + RandomAccessIterator first, + RandomAccessIterator last, + StrictWeakOrdering comp) +{ + // implement with stable_sort + thrust::stable_sort(exec, first, last, comp); +} // end sort() + + +template + void sort_by_key(thrust::execution_policy &exec, + RandomAccessIterator1 keys_first, + RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first) +{ + typedef typename thrust::iterator_value::type value_type; + thrust::sort_by_key(exec, keys_first, keys_last, values_first, thrust::less()); +} // end sort_by_key() + + +template + void sort_by_key(thrust::execution_policy &exec, + RandomAccessIterator1 keys_first, + RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first, + StrictWeakOrdering comp) +{ + // implement with stable_sort_by_key + thrust::stable_sort_by_key(exec, keys_first, keys_last, values_first, comp); +} // end sort_by_key() + + +template + void stable_sort(thrust::execution_policy &exec, + RandomAccessIterator first, + RandomAccessIterator last) +{ + typedef typename thrust::iterator_value::type value_type; + thrust::stable_sort(exec, first, last, thrust::less()); +} // end stable_sort() + + +template + void stable_sort_by_key(thrust::execution_policy &exec, + RandomAccessIterator1 keys_first, + RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first) +{ + typedef typename iterator_value::type value_type; + thrust::stable_sort_by_key(exec, keys_first, keys_last, values_first, thrust::less()); +} // end stable_sort_by_key() + + +template + bool is_sorted(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last) +{ + return thrust::is_sorted_until(exec, first, last) == last; +} // end is_sorted() + + +template + bool is_sorted(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + Compare comp) +{ + return thrust::is_sorted_until(exec, first, last, comp) == last; +} // end is_sorted() + + +template + ForwardIterator is_sorted_until(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last) +{ + typedef typename thrust::iterator_value::type InputType; + + return thrust::is_sorted_until(exec, first, last, thrust::less()); +} // end is_sorted_until() + + +template + ForwardIterator is_sorted_until(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + Compare comp) +{ + if(thrust::distance(first,last) < 2) return last; + + typedef thrust::tuple IteratorTuple; + typedef thrust::zip_iterator ZipIterator; + + ForwardIterator first_plus_one = first; + thrust::advance(first_plus_one, 1); + + ZipIterator zipped_first = thrust::make_zip_iterator(thrust::make_tuple(first_plus_one, first)); + ZipIterator zipped_last = thrust::make_zip_iterator(thrust::make_tuple(last, first)); + + return thrust::get<0>(thrust::find_if(exec, zipped_first, zipped_last, thrust::detail::tuple_binary_predicate(comp)).get_iterator_tuple()); +} // end is_sorted_until() + + +template + void stable_sort(tag, + RandomAccessIterator first, + RandomAccessIterator last, + 
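// [Illustrative sketch, editorial addition -- not part of this patch; the function
// name is hypothetical.]
// is_sorted_until above zips each element with its successor -- tuples of
// (first[i+1], first[i]) -- and uses find_if to locate the first pair where
// comp(successor, element) holds, i.e. the first descent.  The same logic with
// plain sequential iterators (equivalent in result, not in parallelism):
#include <iterator>

template<typename ForwardIterator, typename Compare>
ForwardIterator is_sorted_until_sketch(ForwardIterator first,
                                       ForwardIterator last,
                                       Compare comp)
{
  if(first == last) return last;

  ForwardIterator next = first;
  for(++next; next != last; ++first, ++next)
  {
    if(comp(*next, *first))   // *next breaks the sorted prefix
      return next;
  }
  return last;
}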
StrictWeakOrdering comp) +{ + // unimplemented primitive + THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); +} // end stable_sort() + + +template + void stable_sort_by_key(tag, + RandomAccessIterator1 keys_first, + RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first, + StrictWeakOrdering comp) +{ + // unimplemented primitive + THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); +} // end stable_sort_by_key() + + +} // end generic +} // end detail +} // end system +} // end thrust + diff --git a/compat/thrust/system/detail/generic/swap_ranges.h b/compat/thrust/system/detail/generic/swap_ranges.h new file mode 100644 index 0000000..5d640d3 --- /dev/null +++ b/compat/thrust/system/detail/generic/swap_ranges.h @@ -0,0 +1,46 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + +template + ForwardIterator2 swap_ranges(thrust::execution_policy &exec, + ForwardIterator1 first1, + ForwardIterator1 last1, + ForwardIterator2 first2); + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/detail/generic/swap_ranges.inl b/compat/thrust/system/detail/generic/swap_ranges.inl new file mode 100644 index 0000000..0e12d07 --- /dev/null +++ b/compat/thrust/system/detail/generic/swap_ranges.inl @@ -0,0 +1,73 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ +namespace detail +{ + +// XXX define this here rather than in internal_functional.h +// to avoid circular dependence between swap.h & internal_functional.h +struct swap_pair_elements +{ + template + __host__ __device__ + void operator()(Tuple t) + { + // use unqualified swap to allow ADL to catch any user-defined swap + using thrust::swap; + swap(thrust::get<0>(t), thrust::get<1>(t)); + } +}; // end swap_pair_elements + +} // end detail + +template + ForwardIterator2 swap_ranges(thrust::execution_policy &exec, + ForwardIterator1 first1, + ForwardIterator1 last1, + ForwardIterator2 first2) +{ + typedef thrust::tuple IteratorTuple; + typedef thrust::zip_iterator ZipIterator; + + ZipIterator result = thrust::for_each(exec, + thrust::make_zip_iterator(thrust::make_tuple(first1, first2)), + thrust::make_zip_iterator(thrust::make_tuple(last1, first2)), + detail::swap_pair_elements()); + return thrust::get<1>(result.get_iterator_tuple()); +} // end swap_ranges() + +} // end generic +} // end detail +} // end system +} // end thrust + diff --git a/compat/thrust/system/detail/generic/tabulate.h b/compat/thrust/system/detail/generic/tabulate.h new file mode 100644 index 0000000..e5911b1 --- /dev/null +++ b/compat/thrust/system/detail/generic/tabulate.h @@ -0,0 +1,55 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + +template + void tabulate(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + UnaryOperation unary_op); + +template + OutputIterator tabulate_n(thrust::execution_policy &exec, + OutputIterator first, + Size n, + UnaryOperation unary_op); + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/detail/generic/tabulate.inl b/compat/thrust/system/detail/generic/tabulate.inl new file mode 100644 index 0000000..d2ffc26 --- /dev/null +++ b/compat/thrust/system/detail/generic/tabulate.inl @@ -0,0 +1,59 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
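// [Illustrative sketch, editorial addition -- not part of this patch; the example
// function name is hypothetical.]
// swap_ranges above is just for_each over zipped (a[i], b[i]) pairs, using an
// unqualified swap so that user-defined swaps are found through ADL.  The
// caller-level effect:
#include <thrust/swap.h>
#include <thrust/host_vector.h>

void swap_ranges_example()
{
  thrust::host_vector<int> a(3, 1);   // {1,1,1}
  thrust::host_vector<int> b(3, 2);   // {2,2,2}

  thrust::swap_ranges(a.begin(), a.end(), b.begin());
  // a is now {2,2,2} and b is {1,1,1}
}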
+ */ + +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + + +template + void tabulate(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + UnaryOperation unary_op) +{ + typedef typename iterator_difference::type difference_type; + + // by default, counting_iterator uses a 64b difference_type on 32b platforms to avoid overflowing its counter. + // this causes problems when a zip_iterator is created in transform's implementation -- ForwardIterator is + // incremented by a 64b difference_type and some compilers warn + // to avoid this, specify the counting_iterator's difference_type to be the same as ForwardIterator's. + thrust::counting_iterator iter(0); + + thrust::transform(exec, iter, iter + thrust::distance(first, last), first, unary_op); +} // end tabulate() + + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + + diff --git a/compat/thrust/system/detail/generic/tag.h b/compat/thrust/system/detail/generic/tag.h new file mode 100644 index 0000000..577d6a3 --- /dev/null +++ b/compat/thrust/system/detail/generic/tag.h @@ -0,0 +1,48 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file generic/tag.h + * \brief Implementation of the generic backend's tag. + */ + +#pragma once + +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + +// tag exists only to make the generic entry points the least priority match +// during ADL. tag should not be derived from and is constructible from anything +struct tag +{ + template + __host__ __device__ inline + tag(const T &) {} +}; + +} // end generic +} // end detail +} // end system +} // end thrust + diff --git a/compat/thrust/system/detail/generic/temporary_buffer.h b/compat/thrust/system/detail/generic/temporary_buffer.h new file mode 100644 index 0000000..8cb08b0 --- /dev/null +++ b/compat/thrust/system/detail/generic/temporary_buffer.h @@ -0,0 +1,49 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
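// [Illustrative sketch, editorial addition -- not part of this patch; functor and
// function names are hypothetical.]
// tabulate(first, last, f) above is transform over a counting_iterator: it writes
// f(0), f(1), ..., f(n-1) into the range.  For example, filling a vector with the
// square of each element's index:
#include <thrust/tabulate.h>
#include <thrust/host_vector.h>

struct square_index
{
  __host__ __device__ int operator()(int i) const { return i * i; }
};

void tabulate_example()
{
  thrust::host_vector<int> v(5);
  thrust::tabulate(v.begin(), v.end(), square_index());
  // v is now {0, 1, 4, 9, 16}
}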
+ */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + + +template + thrust::pair, typename thrust::pointer::difference_type> + get_temporary_buffer(thrust::execution_policy &exec, typename thrust::pointer::difference_type n); + + +template + void return_temporary_buffer(thrust::execution_policy &exec, Pointer p); + + +} // end generic +} // end detail +} // end system +} // end thrust + +#include + diff --git a/compat/thrust/system/detail/generic/temporary_buffer.inl b/compat/thrust/system/detail/generic/temporary_buffer.inl new file mode 100644 index 0000000..0a6be7e --- /dev/null +++ b/compat/thrust/system/detail/generic/temporary_buffer.inl @@ -0,0 +1,60 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + + +template + thrust::pair, typename thrust::pointer::difference_type> + get_temporary_buffer(thrust::execution_policy &exec, typename thrust::pointer::difference_type n) +{ + thrust::pointer ptr = thrust::malloc(exec, n); + + // check for a failed malloc + if(!ptr.get()) + { + n = 0; + } // end if + + return thrust::make_pair(ptr, n); +} // end get_temporary_buffer() + + +template + void return_temporary_buffer(thrust::execution_policy &exec, Pointer p) +{ + thrust::free(exec, p); +} // end return_temporary_buffer() + + +} // end generic +} // end detail +} // end system +} // end thrust + diff --git a/compat/thrust/system/detail/generic/transform.h b/compat/thrust/system/detail/generic/transform.h new file mode 100644 index 0000000..e98d402 --- /dev/null +++ b/compat/thrust/system/detail/generic/transform.h @@ -0,0 +1,101 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
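// [Illustrative sketch, editorial addition -- not part of this patch; header and
// tag spellings are approximate for this era of Thrust, and the function name is
// hypothetical.]
// The generic get_temporary_buffer above simply forwards to thrust::malloc and
// reports failure by returning a count of zero rather than throwing.  A typical
// caller pattern, using the host system for concreteness:
#include <thrust/memory.h>
#include <thrust/execution_policy.h>
#include <thrust/pair.h>
#include <cstddef>

void temporary_buffer_example()
{
  thrust::host_system_tag host;

  // ask for scratch space for 100 ints; .second is 0 if the allocation failed
  thrust::pair<thrust::pointer<int, thrust::host_system_tag>, std::ptrdiff_t>
    buf = thrust::get_temporary_buffer<int>(host, 100);

  if(buf.second > 0)
  {
    // ... use buf.first[0] .. buf.first[buf.second - 1] as scratch storage ...
    thrust::return_temporary_buffer(host, buf.first);
  }
}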
+ */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + +template + OutputIterator transform(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + UnaryFunction op); + +template + OutputIterator transform(thrust::execution_policy &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + OutputIterator result, + BinaryFunction op); + +template + ForwardIterator transform_if(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + ForwardIterator result, + UnaryFunction unary_op, + Predicate pred); + +template + ForwardIterator transform_if(thrust::execution_policy &exec, + InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + ForwardIterator result, + UnaryFunction unary_op, + Predicate pred); + +template + ForwardIterator transform_if(thrust::execution_policy &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator3 stencil, + ForwardIterator result, + BinaryFunction binary_op, + Predicate pred); + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/detail/generic/transform.inl b/compat/thrust/system/detail/generic/transform.inl new file mode 100644 index 0000000..8f09953 --- /dev/null +++ b/compat/thrust/system/detail/generic/transform.inl @@ -0,0 +1,214 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + + +template + OutputIterator transform(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + UnaryFunction op) +{ + // XXX WAR the problem of a generic __host__ __device__ functor's inability to invoke + // a function which is only __host__ or __device__ by selecting a generic functor + // which is one or the other + // when nvcc is able to deal with this, remove this WAR + + // given the minimal system, determine the unary transform functor we need + typedef typename thrust::detail::unary_transform_functor::type UnaryTransformFunctor; + + // make an iterator tuple + typedef thrust::tuple IteratorTuple; + typedef thrust::zip_iterator ZipIterator; + + ZipIterator zipped_result = + thrust::for_each(exec, + thrust::make_zip_iterator(thrust::make_tuple(first,result)), + thrust::make_zip_iterator(thrust::make_tuple(last,result)), + UnaryTransformFunctor(op)); + + return thrust::get<1>(zipped_result.get_iterator_tuple()); +} // end transform() + + +template + OutputIterator transform(thrust::execution_policy &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + OutputIterator result, + BinaryFunction op) +{ + // XXX WAR the problem of a generic __host__ __device__ functor's inability to invoke + // a function which is only __host__ or __device__ by selecting a generic functor + // which is one or the other + // when nvcc is able to deal with this, remove this WAR + + // given the minimal system, determine the binary transform functor we need + typedef typename thrust::detail::binary_transform_functor::type BinaryTransformFunctor; + + // make an iterator tuple + typedef thrust::tuple IteratorTuple; + typedef thrust::zip_iterator ZipIterator; + + ZipIterator zipped_result = + thrust::for_each(exec, + thrust::make_zip_iterator(thrust::make_tuple(first1,first2,result)), + thrust::make_zip_iterator(thrust::make_tuple(last1,first2,result)), + BinaryTransformFunctor(op)); + + return thrust::get<2>(zipped_result.get_iterator_tuple()); +} // end transform() + + +template + ForwardIterator transform_if(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + ForwardIterator result, + UnaryFunction unary_op, + Predicate pred) +{ + // XXX WAR the problem of a generic __host__ __device__ functor's inability to invoke + // a function which is only __host__ or __device__ by selecting a generic functor + // which is one or the other + // when nvcc is able to deal with this, remove this WAR + + // given the minimal system, determine the unary transform_if functor we need + typedef typename thrust::detail::unary_transform_if_functor::type UnaryTransformIfFunctor; + + // make an iterator tuple + typedef thrust::tuple IteratorTuple; + typedef thrust::zip_iterator ZipIterator; + + ZipIterator zipped_result = + thrust::for_each(exec, + thrust::make_zip_iterator(thrust::make_tuple(first,result)), + thrust::make_zip_iterator(thrust::make_tuple(last,result)), + UnaryTransformIfFunctor(unary_op,pred)); + + return thrust::get<1>(zipped_result.get_iterator_tuple()); +} // end transform_if() + + +template + ForwardIterator transform_if(thrust::execution_policy &exec, + InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + ForwardIterator result, + UnaryFunction unary_op, + Predicate pred) +{ + // XXX WAR the problem of a generic 
__host__ __device__ functor's inability to invoke + // a function which is only __host__ or __device__ by selecting a generic functor + // which is one or the other + // when nvcc is able to deal with this, remove this WAR + + // given the minimal system, determine the unary transform_if functor we need + typedef typename thrust::detail::unary_transform_if_with_stencil_functor::type UnaryTransformIfFunctor; + + // make an iterator tuple + typedef thrust::tuple IteratorTuple; + typedef thrust::zip_iterator ZipIterator; + + ZipIterator zipped_result = + thrust::for_each(exec, + thrust::make_zip_iterator(thrust::make_tuple(first,stencil,result)), + thrust::make_zip_iterator(thrust::make_tuple(last,stencil,result)), + UnaryTransformIfFunctor(unary_op,pred)); + + return thrust::get<2>(zipped_result.get_iterator_tuple()); +} // end transform_if() + + +template + ForwardIterator transform_if(thrust::execution_policy &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator3 stencil, + ForwardIterator result, + BinaryFunction binary_op, + Predicate pred) +{ + // XXX WAR the problem of a generic __host__ __device__ functor's inability to invoke + // a function which is only __host__ or __device__ by selecting a generic functor + // which is one or the other + // when nvcc is able to deal with this, remove this WAR + + // given the minimal system, determine the binary transform_if functor we need + typedef typename thrust::detail::binary_transform_if_functor::type BinaryTransformIfFunctor; + + // make an iterator tuple + typedef thrust::tuple IteratorTuple; + typedef thrust::zip_iterator ZipIterator; + + ZipIterator zipped_result = + thrust::for_each(exec, + thrust::make_zip_iterator(thrust::make_tuple(first1,first2,stencil,result)), + thrust::make_zip_iterator(thrust::make_tuple(last1,first2,stencil,result)), + BinaryTransformIfFunctor(binary_op,pred)); + + return thrust::get<3>(zipped_result.get_iterator_tuple()); +} // end transform_if() + + +} // end generic +} // end detail +} // end system +} // end thrust + diff --git a/compat/thrust/system/detail/generic/transform_reduce.h b/compat/thrust/system/detail/generic/transform_reduce.h new file mode 100644 index 0000000..c1f098f --- /dev/null +++ b/compat/thrust/system/detail/generic/transform_reduce.h @@ -0,0 +1,50 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
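// [Illustrative sketch, editorial addition -- not part of this patch; functor and
// function names are hypothetical.]
// Every transform/transform_if overload above follows one pattern: zip the input
// and output ranges together and run for_each with a functor that writes
// op(input) through the output member of each tuple.  A stripped-down equivalent
// of the unary case, mirroring how the implementation builds its zip range:
#include <thrust/for_each.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/tuple.h>
#include <thrust/host_vector.h>

struct negate_into_second
{
  template<typename Tuple>
  __host__ __device__ void operator()(Tuple t) const
  {
    thrust::get<1>(t) = -thrust::get<0>(t);   // result[i] = op(input[i])
  }
};

void transform_as_for_each_example()
{
  thrust::host_vector<int> in(4, 7), out(4);

  thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(in.begin(), out.begin())),
                   thrust::make_zip_iterator(thrust::make_tuple(in.end(),   out.begin())),
                   negate_into_second());
  // out is now {-7,-7,-7,-7}, the same result thrust::transform would produce
}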
+ */ + + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + +template + OutputType transform_reduce(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + UnaryFunction unary_op, + OutputType init, + BinaryFunction binary_op); + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/detail/generic/transform_reduce.inl b/compat/thrust/system/detail/generic/transform_reduce.inl new file mode 100644 index 0000000..ce8b6a1 --- /dev/null +++ b/compat/thrust/system/detail/generic/transform_reduce.inl @@ -0,0 +1,53 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + +template + OutputType transform_reduce(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + UnaryFunction unary_op, + OutputType init, + BinaryFunction binary_op) +{ + thrust::transform_iterator xfrm_first(first, unary_op); + thrust::transform_iterator xfrm_last(last, unary_op); + + return thrust::reduce(exec, xfrm_first, xfrm_last, init, binary_op); +} // end transform_reduce() + +} // end generic +} // end detail +} // end system +} // end thrust + diff --git a/compat/thrust/system/detail/generic/transform_scan.h b/compat/thrust/system/detail/generic/transform_scan.h new file mode 100644 index 0000000..99db86e --- /dev/null +++ b/compat/thrust/system/detail/generic/transform_scan.h @@ -0,0 +1,64 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
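// [Illustrative sketch, editorial addition -- not part of this patch; functor and
// function names are hypothetical.]
// transform_reduce above never materializes the transformed sequence: it wraps the
// input in a transform_iterator and hands that directly to reduce.  Typical use,
// computing a sum of squares:
#include <thrust/transform_reduce.h>
#include <thrust/functional.h>
#include <thrust/host_vector.h>

struct square
{
  __host__ __device__ int operator()(int x) const { return x * x; }
};

void sum_of_squares_example()
{
  thrust::host_vector<int> v(3);
  v[0] = 1; v[1] = 2; v[2] = 3;

  int result = thrust::transform_reduce(v.begin(), v.end(),
                                        square(), 0, thrust::plus<int>());
  // result == 14
}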
+ */ + + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + +template + OutputIterator transform_inclusive_scan(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + UnaryFunction unary_op, + BinaryFunction binary_op); + +template + OutputIterator transform_exclusive_scan(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + UnaryFunction unary_op, + T init, + AssociativeOperator binary_op); + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/detail/generic/transform_scan.inl b/compat/thrust/system/detail/generic/transform_scan.inl new file mode 100644 index 0000000..a95ec20 --- /dev/null +++ b/compat/thrust/system/detail/generic/transform_scan.inl @@ -0,0 +1,124 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + +template + OutputIterator transform_inclusive_scan(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + UnaryFunction unary_op, + BinaryFunction binary_op) +{ + // the pseudocode for deducing the type of the temporary used below: + // + // if UnaryFunction is AdaptableUnaryFunction + // TemporaryType = AdaptableUnaryFunction::result_type + // else if OutputIterator is a "pure" output iterator + // TemporaryType = InputIterator::value_type + // else + // TemporaryType = OutputIterator::value_type + // + // XXX upon c++0x, TemporaryType needs to be: + // result_of::type + + typedef typename thrust::detail::eval_if< + thrust::detail::has_result_type::value, + thrust::detail::result_type, + thrust::detail::eval_if< + thrust::detail::is_output_iterator::value, + thrust::iterator_value, + thrust::iterator_value + > + >::type ValueType; + + thrust::transform_iterator _first(first, unary_op); + thrust::transform_iterator _last(last, unary_op); + + return thrust::inclusive_scan(exec, _first, _last, result, binary_op); +} // end transform_inclusive_scan() + +template + OutputIterator transform_exclusive_scan(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + UnaryFunction unary_op, + T init, + AssociativeOperator binary_op) +{ + // the pseudocode for deducing the type of the temporary used below: + // + // if UnaryFunction is AdaptableUnaryFunction + // TemporaryType = AdaptableUnaryFunction::result_type + // else if OutputIterator is a "pure" output iterator + // TemporaryType = InputIterator::value_type + // else + // TemporaryType = OutputIterator::value_type + // + // XXX upon c++0x, TemporaryType needs to be: + // result_of::type + + typedef typename 
thrust::detail::eval_if< + thrust::detail::has_result_type::value, + thrust::detail::result_type, + thrust::detail::eval_if< + thrust::detail::is_output_iterator::value, + thrust::iterator_value, + thrust::iterator_value + > + >::type ValueType; + + thrust::transform_iterator _first(first, unary_op); + thrust::transform_iterator _last(last, unary_op); + + return thrust::exclusive_scan(exec, _first, _last, result, init, binary_op); +} // end transform_exclusive_scan() + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + + diff --git a/compat/thrust/system/detail/generic/type_traits.h b/compat/thrust/system/detail/generic/type_traits.h new file mode 100644 index 0000000..4011352 --- /dev/null +++ b/compat/thrust/system/detail/generic/type_traits.h @@ -0,0 +1,168 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file generic/type_traits.h + * \brief Introspection for free functions defined in generic. + */ + +#pragma once + +#include + +namespace thrust +{ + +// forward declaration of any_system_tag for any_conversion below +struct any_system_tag; + +namespace system +{ +namespace detail +{ + +// we must define these traits outside of generic's namespace +namespace generic_type_traits_ns +{ + +typedef char yes; +typedef char (&no)[2]; + +struct any_conversion +{ + template any_conversion(const T &); + + // add this extra constructor to disambiguate conversion from any_system_tag + any_conversion(const any_system_tag &); +}; + +namespace select_system_exists_ns +{ + no select_system(const any_conversion &); + no select_system(const any_conversion &, const any_conversion &); + no select_system(const any_conversion &, const any_conversion &, const any_conversion &); + no select_system(const any_conversion &, const any_conversion &, const any_conversion &, const any_conversion &); + no select_system(const any_conversion &, const any_conversion &, const any_conversion &, const any_conversion &, const any_conversion &); + no select_system(const any_conversion &, const any_conversion &, const any_conversion &, const any_conversion &, const any_conversion &, const any_conversion &); + + template yes check(const T &); + + no check(no); + + template + struct select_system1_exists + { + static Tag &tag; + + static const bool value = sizeof(check(select_system(tag))) == sizeof(yes); + }; + + template + struct select_system2_exists + { + static Tag1 &tag1; + static Tag2 &tag2; + + static const bool value = sizeof(check(select_system(tag1,tag2))) == sizeof(yes); + }; + + template + struct select_system3_exists + { + static Tag1 &tag1; + static Tag2 &tag2; + static Tag3 &tag3; + + static const bool value = sizeof(check(select_system(tag1,tag2,tag3))) == sizeof(yes); + }; + + template + struct select_system4_exists + { + static Tag1 &tag1; + static Tag2 &tag2; + static Tag3 &tag3; + static Tag4 &tag4; + + static const bool value = sizeof(check(select_system(tag1,tag2,tag3,tag4))) 
== sizeof(yes); + }; + + template + struct select_system5_exists + { + static Tag1 &tag1; + static Tag2 &tag2; + static Tag3 &tag3; + static Tag4 &tag4; + static Tag5 &tag5; + + static const bool value = sizeof(check(select_system(tag1,tag2,tag3,tag4,tag5))) == sizeof(yes); + }; + + template + struct select_system6_exists + { + static Tag1 &tag1; + static Tag2 &tag2; + static Tag3 &tag3; + static Tag4 &tag4; + static Tag5 &tag5; + static Tag6 &tag6; + + static const bool value = sizeof(check(select_system(tag1,tag2,tag3,tag4,tag5,tag6))) == sizeof(yes); + }; +} // end select_system_exists_ns + +} // end generic_type_traits_ns + +namespace generic +{ + +template + struct select_system1_exists + : generic_type_traits_ns::select_system_exists_ns::select_system1_exists +{}; + +template + struct select_system2_exists + : generic_type_traits_ns::select_system_exists_ns::select_system2_exists +{}; + +template + struct select_system3_exists + : generic_type_traits_ns::select_system_exists_ns::select_system3_exists +{}; + +template + struct select_system4_exists + : generic_type_traits_ns::select_system_exists_ns::select_system4_exists +{}; + +template + struct select_system5_exists + : generic_type_traits_ns::select_system_exists_ns::select_system5_exists +{}; + +template + struct select_system6_exists + : generic_type_traits_ns::select_system_exists_ns::select_system6_exists +{}; + +} // end generic +} // end detail +} // end system +} // end thrust + diff --git a/compat/thrust/system/detail/generic/uninitialized_copy.h b/compat/thrust/system/detail/generic/uninitialized_copy.h new file mode 100644 index 0000000..67e3e68 --- /dev/null +++ b/compat/thrust/system/detail/generic/uninitialized_copy.h @@ -0,0 +1,55 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + +template + ForwardIterator uninitialized_copy(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + ForwardIterator result); + +template + ForwardIterator uninitialized_copy_n(thrust::execution_policy &exec, + InputIterator first, + Size n, + ForwardIterator result); + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/detail/generic/uninitialized_copy.inl b/compat/thrust/system/detail/generic/uninitialized_copy.inl new file mode 100644 index 0000000..414e6e4 --- /dev/null +++ b/compat/thrust/system/detail/generic/uninitialized_copy.inl @@ -0,0 +1,187 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
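// [Illustrative sketch, editorial addition -- not part of this patch; all names
// here are hypothetical.]
// select_systemN_exists above uses the classic C++03 sizeof/SFINAE probe: a
// catch-all overload returning a two-byte type, a template overload returning a
// one-byte type, and a sizeof comparison to see which one an unevaluated call
// would pick.  The same trick, testing whether a free function frobnicate(T) has
// been overloaded for a given type:
typedef char yes_type;
typedef char (&no_type)[2];

struct accepts_anything
{
  template<typename T> accepts_anything(const T &);
};

no_type frobnicate(const accepts_anything &);    // lowest-priority fallback

template<typename T> yes_type check_result(const T &);
no_type                       check_result(no_type);

template<typename T>
struct frobnicate_exists
{
  static T &t;   // never defined; only used in the unevaluated sizeof below

  static const bool value =
    sizeof(check_result(frobnicate(t))) == sizeof(yes_type);
};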
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ +namespace detail +{ + +template + struct uninitialized_copy_functor +{ + template + __host__ __device__ + void operator()(Tuple t) + { + const InputType &in = thrust::get<0>(t); + OutputType &out = thrust::get<1>(t); + + ::new(static_cast(&out)) OutputType(in); + } // end operator()() +}; // end uninitialized_copy_functor + + +// non-trivial copy constructor path +template + ForwardIterator uninitialized_copy(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + ForwardIterator result, + thrust::detail::false_type) // has_trivial_copy_constructor +{ + // zip up the iterators + typedef thrust::tuple IteratorTuple; + typedef thrust::zip_iterator ZipIterator; + + ZipIterator begin = thrust::make_zip_iterator(thrust::make_tuple(first,result)); + ZipIterator end = begin; + + // get a zip_iterator pointing to the end + const typename thrust::iterator_difference::type n = thrust::distance(first,last); + thrust::advance(end, n); + + // create a functor + typedef typename iterator_traits::value_type InputType; + typedef typename iterator_traits::value_type OutputType; + + detail::uninitialized_copy_functor f; + + // do the for_each + thrust::for_each(exec, begin, end, f); + + // return the end of the output range + return thrust::get<1>(end.get_iterator_tuple()); +} // end uninitialized_copy() + + +// trivial copy constructor path +template + ForwardIterator uninitialized_copy(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + ForwardIterator result, + thrust::detail::true_type) // has_trivial_copy_constructor +{ + return thrust::copy(exec, first, last, result); +} // end uninitialized_copy() + + +// non-trivial copy constructor path +template + ForwardIterator uninitialized_copy_n(thrust::execution_policy &exec, + InputIterator first, + Size n, + ForwardIterator result, + thrust::detail::false_type) // has_trivial_copy_constructor +{ + // zip up the iterators + typedef thrust::tuple IteratorTuple; + typedef thrust::zip_iterator ZipIterator; + + ZipIterator zipped_first = thrust::make_zip_iterator(thrust::make_tuple(first,result)); + + // create a functor + typedef typename iterator_traits::value_type InputType; + typedef typename iterator_traits::value_type OutputType; + + detail::uninitialized_copy_functor f; + + // do the for_each_n + ZipIterator zipped_last = thrust::for_each_n(exec, zipped_first, n, f); + + // return the end of the output range + return thrust::get<1>(zipped_last.get_iterator_tuple()); +} // end uninitialized_copy_n() + + +// trivial copy constructor path +template + ForwardIterator uninitialized_copy_n(thrust::execution_policy &exec, + InputIterator first, + Size n, + ForwardIterator result, + thrust::detail::true_type) // has_trivial_copy_constructor +{ + return thrust::copy_n(exec, first, n, result); +} // end uninitialized_copy_n() + + +} // end detail + + +template + ForwardIterator uninitialized_copy(thrust::execution_policy &exec, + 
InputIterator first, + InputIterator last, + ForwardIterator result) +{ + typedef typename iterator_traits::value_type ResultType; + + typedef typename thrust::detail::has_trivial_copy_constructor::type ResultTypeHasTrivialCopyConstructor; + + return thrust::system::detail::generic::detail::uninitialized_copy(exec, first, last, result, ResultTypeHasTrivialCopyConstructor()); +} // end uninitialized_copy() + + +template + ForwardIterator uninitialized_copy_n(thrust::execution_policy &exec, + InputIterator first, + Size n, + ForwardIterator result) +{ + typedef typename iterator_traits::value_type ResultType; + + typedef typename thrust::detail::has_trivial_copy_constructor::type ResultTypeHasTrivialCopyConstructor; + + return thrust::system::detail::generic::detail::uninitialized_copy_n(exec, first, n, result, ResultTypeHasTrivialCopyConstructor()); +} // end uninitialized_copy_n() + + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/generic/uninitialized_fill.h b/compat/thrust/system/detail/generic/uninitialized_fill.h new file mode 100644 index 0000000..c1df694 --- /dev/null +++ b/compat/thrust/system/detail/generic/uninitialized_fill.h @@ -0,0 +1,55 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + +template + void uninitialized_fill(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + const T &x); + +template + ForwardIterator uninitialized_fill_n(thrust::execution_policy &exec, + ForwardIterator first, + Size n, + const T &x); + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/detail/generic/uninitialized_fill.inl b/compat/thrust/system/detail/generic/uninitialized_fill.inl new file mode 100644 index 0000000..bb30b24 --- /dev/null +++ b/compat/thrust/system/detail/generic/uninitialized_fill.inl @@ -0,0 +1,128 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
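// [Illustrative sketch, editorial addition -- not part of this patch; the function
// name is hypothetical.]
// uninitialized_copy above picks one of two paths at compile time: for trivially
// copy-constructible value types it degenerates to thrust::copy, otherwise it
// placement-news each output element from the corresponding input.  The core of
// the non-trivial path, written sequentially:
#include <iterator>
#include <new>

template<typename InputIterator, typename ForwardIterator>
ForwardIterator uninitialized_copy_sketch(InputIterator first,
                                          InputIterator last,
                                          ForwardIterator result)
{
  typedef typename std::iterator_traits<ForwardIterator>::value_type OutputType;

  for(; first != last; ++first, ++result)
  {
    // construct the output element in raw storage from the input element
    ::new(static_cast<void*>(&*result)) OutputType(*first);
  }
  return result;
}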
+ */ + +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ +namespace detail +{ + +template + void uninitialized_fill(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + const T &x, + thrust::detail::true_type) // has_trivial_copy_constructor +{ + thrust::fill(exec, first, last, x); +} // end uninitialized_fill() + +template + void uninitialized_fill(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + const T &x, + thrust::detail::false_type) // has_trivial_copy_constructor +{ + typedef typename iterator_traits::value_type ValueType; + + thrust::for_each(exec, first, last, thrust::detail::uninitialized_fill_functor(x)); +} // end uninitialized_fill() + +template + ForwardIterator uninitialized_fill_n(thrust::execution_policy &exec, + ForwardIterator first, + Size n, + const T &x, + thrust::detail::true_type) // has_trivial_copy_constructor +{ + return thrust::fill_n(exec, first, n, x); +} // end uninitialized_fill() + +template + ForwardIterator uninitialized_fill_n(thrust::execution_policy &exec, + ForwardIterator first, + Size n, + const T &x, + thrust::detail::false_type) // has_trivial_copy_constructor +{ + typedef typename iterator_traits::value_type ValueType; + + return thrust::for_each_n(exec, first, n, thrust::detail::uninitialized_fill_functor(x)); +} // end uninitialized_fill() + +} // end detail + +template + void uninitialized_fill(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + const T &x) +{ + typedef typename iterator_traits::value_type ValueType; + + typedef thrust::detail::has_trivial_copy_constructor ValueTypeHasTrivialCopyConstructor; + + thrust::system::detail::generic::detail::uninitialized_fill(exec, first, last, x, + ValueTypeHasTrivialCopyConstructor()); +} // end uninitialized_fill() + +template + ForwardIterator uninitialized_fill_n(thrust::execution_policy &exec, + ForwardIterator first, + Size n, + const T &x) +{ + typedef typename iterator_traits::value_type ValueType; + + typedef thrust::detail::has_trivial_copy_constructor ValueTypeHasTrivialCopyConstructor; + + return thrust::system::detail::generic::detail::uninitialized_fill_n(exec, first, n, x, + ValueTypeHasTrivialCopyConstructor()); +} // end uninitialized_fill() + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/generic/unique.h b/compat/thrust/system/detail/generic/unique.h new file mode 100644 index 0000000..57e17ca --- /dev/null +++ b/compat/thrust/system/detail/generic/unique.h @@ -0,0 +1,74 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + + +template +ForwardIterator unique(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last); + + +template +ForwardIterator unique(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + BinaryPredicate binary_pred); + + +template +OutputIterator unique_copy(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator output); + + +template +OutputIterator unique_copy(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator output, + BinaryPredicate binary_pred); + + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/detail/generic/unique.inl b/compat/thrust/system/detail/generic/unique.inl new file mode 100644 index 0000000..42d6b15 --- /dev/null +++ b/compat/thrust/system/detail/generic/unique.inl @@ -0,0 +1,114 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file unique.inl + * \brief Inline file for unique.h. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + + +template + ForwardIterator unique(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last) +{ + typedef typename thrust::iterator_traits::value_type InputType; + + return thrust::unique(exec, first, last, thrust::equal_to()); +} // end unique() + + +template + ForwardIterator unique(thrust::execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + BinaryPredicate binary_pred) +{ + typedef typename thrust::iterator_traits::value_type InputType; + + thrust::detail::temporary_array input(exec, first, last); + + return thrust::unique_copy(exec, input.begin(), input.end(), first, binary_pred); +} // end unique() + + +template + OutputIterator unique_copy(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator output) +{ + typedef typename thrust::iterator_value::type value_type; + return thrust::unique_copy(exec, first,last,output,thrust::equal_to()); +} // end unique_copy() + + +template + OutputIterator unique_copy(thrust::execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator output, + BinaryPredicate binary_pred) +{ + // empty sequence + if(first == last) + return output; + + thrust::detail::temporary_array stencil(exec, thrust::distance(first, last)); + + // mark first element in each group + stencil[0] = 1; + thrust::transform(exec, first, last - 1, first + 1, stencil.begin() + 1, thrust::detail::not2(binary_pred)); + + return thrust::copy_if(exec, first, last, stencil.begin(), output, thrust::identity()); +} // end unique_copy() + + +} 
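// [Illustrative sketch, editorial addition -- not part of this patch; the example
// function name is hypothetical.]
// unique_copy above builds a "head flags" stencil: flag[i] is 1 when element i
// starts a new group (i == 0 or !pred(x[i-1], x[i])), and copy_if then keeps the
// flagged elements.  On a small input:
//
//   input   : 1 1 2 3 3 3
//   stencil : 1 0 1 1 0 0
//   output  : 1 2 3
//
// Caller-level equivalent:
#include <thrust/unique.h>
#include <thrust/host_vector.h>

void unique_copy_example()
{
  thrust::host_vector<int> in(6), out(6);
  in[0] = 1; in[1] = 1; in[2] = 2; in[3] = 3; in[4] = 3; in[5] = 3;

  thrust::host_vector<int>::iterator end =
    thrust::unique_copy(in.begin(), in.end(), out.begin());
  // out holds {1, 2, 3}; end - out.begin() == 3
}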
// end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/generic/unique_by_key.h b/compat/thrust/system/detail/generic/unique_by_key.h new file mode 100644 index 0000000..aa62f73 --- /dev/null +++ b/compat/thrust/system/detail/generic/unique_by_key.h @@ -0,0 +1,91 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + + +template + thrust::pair + unique_by_key(thrust::execution_policy &exec, + ForwardIterator1 keys_first, + ForwardIterator1 keys_last, + ForwardIterator2 values_first); + + +template + thrust::pair + unique_by_key(thrust::execution_policy &exec, + ForwardIterator1 keys_first, + ForwardIterator1 keys_last, + ForwardIterator2 values_first, + BinaryPredicate binary_pred); + + +template + thrust::pair + unique_by_key_copy(thrust::execution_policy &exec, + InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + OutputIterator1 keys_output, + OutputIterator2 values_output); + + +template + thrust::pair + unique_by_key_copy(thrust::execution_policy &exec, + InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + OutputIterator1 keys_output, + OutputIterator2 values_output, + BinaryPredicate binary_pred); + + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/detail/generic/unique_by_key.inl b/compat/thrust/system/detail/generic/unique_by_key.inl new file mode 100644 index 0000000..c780fa7 --- /dev/null +++ b/compat/thrust/system/detail/generic/unique_by_key.inl @@ -0,0 +1,142 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace generic +{ + + +template + thrust::pair + unique_by_key(thrust::execution_policy &exec, + ForwardIterator1 keys_first, + ForwardIterator1 keys_last, + ForwardIterator2 values_first) +{ + typedef typename thrust::iterator_traits::value_type KeyType; + return thrust::unique_by_key(exec, keys_first, keys_last, values_first, thrust::equal_to()); +} // end unique_by_key() + + +template + thrust::pair + unique_by_key(thrust::execution_policy &exec, + ForwardIterator1 keys_first, + ForwardIterator1 keys_last, + ForwardIterator2 values_first, + BinaryPredicate binary_pred) +{ + typedef typename thrust::iterator_traits::value_type InputType1; + typedef typename thrust::iterator_traits::value_type InputType2; + + ForwardIterator2 values_last = values_first + (keys_last - keys_first); + + thrust::detail::temporary_array keys(exec, keys_first, keys_last); + thrust::detail::temporary_array vals(exec, values_first, values_last); + + return thrust::unique_by_key_copy(exec, keys.begin(), keys.end(), vals.begin(), keys_first, values_first, binary_pred); +} // end unique_by_key() + + +template + thrust::pair + unique_by_key_copy(thrust::execution_policy &exec, + InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + OutputIterator1 keys_output, + OutputIterator2 values_output) +{ + typedef typename thrust::iterator_traits::value_type KeyType; + return thrust::unique_by_key_copy(exec, keys_first, keys_last, values_first, keys_output, values_output, thrust::equal_to()); +} // end unique_by_key_copy() + + +template + thrust::pair + unique_by_key_copy(thrust::execution_policy &exec, + InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + OutputIterator1 keys_output, + OutputIterator2 values_output, + BinaryPredicate binary_pred) +{ + typedef typename thrust::iterator_traits::difference_type difference_type; + + // empty sequence + if(keys_first == keys_last) + return thrust::make_pair(keys_output, values_output); + + difference_type n = thrust::distance(keys_first, keys_last); + + thrust::detail::temporary_array stencil(exec,n); + + // mark first element in each group + stencil[0] = 1; + thrust::transform(exec, keys_first, keys_last - 1, keys_first + 1, stencil.begin() + 1, thrust::detail::not2(binary_pred)); + + thrust::zip_iterator< thrust::tuple > result = + thrust::copy_if(exec, + thrust::make_zip_iterator(thrust::make_tuple(keys_first, values_first)), + thrust::make_zip_iterator(thrust::make_tuple(keys_first, values_first)) + n, + stencil.begin(), + thrust::make_zip_iterator(thrust::make_tuple(keys_output, values_output)), + thrust::identity()); + + difference_type output_size = result - thrust::make_zip_iterator(thrust::make_tuple(keys_output, values_output)); + + return thrust::make_pair(keys_output + output_size, values_output + output_size); +} // end unique_by_key_copy() + + +} // end namespace generic +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/internal/decompose.h b/compat/thrust/system/detail/internal/decompose.h new file mode 100644 index 0000000..dea806d --- /dev/null +++ b/compat/thrust/system/detail/internal/decompose.h @@ -0,0 +1,113 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); 
+ * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace internal +{ + + template + class index_range + { + public: + typedef IndexType index_type; + + __host__ __device__ + index_range(index_type begin, index_type end) : m_begin(begin), m_end(end) {} + + __host__ __device__ + index_type begin(void) const { return m_begin; } + + __host__ __device__ + index_type end(void) const { return m_end; } + + __host__ __device__ + index_type size(void) const { return m_end - m_begin; } + + private: + index_type m_begin; + index_type m_end; + }; + + template + class uniform_decomposition + { + public: + typedef IndexType index_type; + typedef index_range range_type; + + uniform_decomposition(index_type N, index_type granularity, index_type max_intervals) + : m_N(N), + m_intervals((N + granularity - 1) / granularity), + m_threshold(0), + m_small_interval(granularity), + m_large_interval(0) + { + if(m_intervals > max_intervals) + { + m_small_interval = granularity * (m_intervals / max_intervals); + m_large_interval = m_small_interval + granularity; + m_threshold = m_intervals % max_intervals; + m_intervals = max_intervals; + } + } + + __host__ __device__ + index_range operator[](const index_type& i) const + { + if (i < m_threshold) + { + index_type begin = m_large_interval * i; + index_type end = begin + m_large_interval; + return range_type(begin, end); + } + else + { + index_type begin = m_large_interval * m_threshold + m_small_interval * (i - m_threshold); + index_type end = (begin + m_small_interval < m_N) ? begin + m_small_interval : m_N; + return range_type(begin, end); + } + } + + __host__ __device__ + index_type size(void) const + { + return m_intervals; + } + + private: + + index_type m_N; + index_type m_intervals; + index_type m_threshold; + index_type m_small_interval; + index_type m_large_interval; + }; + + +} // end namespace internal +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/internal/scalar/adjacent_difference.h b/compat/thrust/system/detail/internal/scalar/adjacent_difference.h new file mode 100644 index 0000000..d1a95ae --- /dev/null +++ b/compat/thrust/system/detail/internal/scalar/adjacent_difference.h @@ -0,0 +1,70 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file adjacent_difference.h + * \brief Sequential implementation of adjacent_difference. 
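Aside: uniform_decomposition above caps the interval count at max_intervals by making the first (intervals % max_intervals) intervals one granularity larger than the rest. The standalone check below redoes that arithmetic in plain C++ and verifies the intervals tile [0, N) exactly; decompose_sketch and the chosen test values are hypothetical, not from the patch.

#include <cassert>
#include <utility>
#include <vector>

// Recompute the interval layout the same way uniform_decomposition does and
// return the (begin, end) pairs so they can be checked for contiguity.
std::vector<std::pair<long, long> >
decompose_sketch(long N, long granularity, long max_intervals)
{
    long intervals      = (N + granularity - 1) / granularity;   // ceil(N / granularity)
    long threshold      = 0;
    long small_interval = granularity;
    long large_interval = 0;

    if (intervals > max_intervals)
    {
        small_interval = granularity * (intervals / max_intervals);
        large_interval = small_interval + granularity;
        threshold      = intervals % max_intervals;
        intervals      = max_intervals;
    }

    std::vector<std::pair<long, long> > ranges;
    for (long i = 0; i < intervals; ++i)
    {
        long begin, end;
        if (i < threshold)
        {
            begin = large_interval * i;                           // padded intervals first
            end   = begin + large_interval;
        }
        else
        {
            begin = large_interval * threshold + small_interval * (i - threshold);
            end   = (begin + small_interval < N) ? begin + small_interval : N;
        }
        ranges.push_back(std::make_pair(begin, end));
    }
    return ranges;
}

int main()
{
    std::vector<std::pair<long, long> > r = decompose_sketch(1000, 32, 7);
    assert(r.size() == 7 && r.front().first == 0 && r.back().second == 1000);
    for (std::size_t i = 1; i < r.size(); ++i)
        assert(r[i].first == r[i - 1].second);   // contiguous, non-overlapping
    return 0;
}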
+ */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace internal +{ +namespace scalar +{ + +template +OutputIterator adjacent_difference(InputIterator first, + InputIterator last, + OutputIterator result, + BinaryFunction binary_op) +{ + typedef typename thrust::iterator_traits::value_type InputType; + + if (first == last) + return result; + + InputType curr = *first; + + *result = curr; + + while (++first != last) + { + InputType next = *first; + *(++result) = binary_op(next, curr); + curr = next; + } + + return ++result; +} + +} // end namespace scalar +} // end namespace internal +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/internal/scalar/binary_search.h b/compat/thrust/system/detail/internal/scalar/binary_search.h new file mode 100644 index 0000000..c3ac49f --- /dev/null +++ b/compat/thrust/system/detail/internal/scalar/binary_search.h @@ -0,0 +1,143 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file binary_search.h + * \brief Sequential implementation of binary search algorithms. + */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace internal +{ +namespace scalar +{ + +template +ForwardIterator lower_bound(ForwardIterator first, + ForwardIterator last, + const T& val, + StrictWeakOrdering comp) +{ + // wrap comp + thrust::detail::host_function< + StrictWeakOrdering, + bool + > wrapped_comp(comp); + + typedef typename thrust::iterator_difference::type difference_type; + + difference_type len = thrust::distance(first, last); + + while(len > 0) + { + difference_type half = len >> 1; + ForwardIterator middle = first; + + thrust::advance(middle, half); + + if(wrapped_comp(*middle, val)) + { + first = middle; + ++first; + len = len - half - 1; + } + else + { + len = half; + } + } + + return first; +} + + +template +ForwardIterator upper_bound(ForwardIterator first, + ForwardIterator last, + const T& val, + StrictWeakOrdering comp) +{ + // wrap comp + thrust::detail::host_function< + StrictWeakOrdering, + bool + > wrapped_comp(comp); + + typedef typename thrust::iterator_difference::type difference_type; + + difference_type len = thrust::distance(first, last); + + while(len > 0) + { + difference_type half = len >> 1; + ForwardIterator middle = first; + + thrust::advance(middle, half); + + if(wrapped_comp(val, *middle)) + { + len = half; + } + else + { + first = middle; + ++first; + len = len - half - 1; + } + } + + return first; +} + +template +bool binary_search(ForwardIterator first, + ForwardIterator last, + const T& val, + StrictWeakOrdering comp) +{ + ForwardIterator iter = thrust::system::detail::internal::scalar::lower_bound(first, last, val, comp); + + // wrap comp + thrust::detail::host_function< + StrictWeakOrdering, + bool + > wrapped_comp(comp); + + return iter != last && 
!wrapped_comp(val,*iter); +} + +} // end namespace scalar +} // end namespace internal +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/internal/scalar/copy.h b/compat/thrust/system/detail/internal/scalar/copy.h new file mode 100644 index 0000000..42cb385 --- /dev/null +++ b/compat/thrust/system/detail/internal/scalar/copy.h @@ -0,0 +1,56 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file copy.h + * \brief Sequential implementations of copy algorithms. + */ + +#pragma once + +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace internal +{ +namespace scalar +{ + +template + OutputIterator copy(InputIterator first, + InputIterator last, + OutputIterator result); + +template + OutputIterator copy_n(InputIterator first, + Size n, + OutputIterator result); + +} // end namespace scalar +} // end namespace internal +} // end namespace detail +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/detail/internal/scalar/copy.inl b/compat/thrust/system/detail/internal/scalar/copy.inl new file mode 100644 index 0000000..8c9f5c2 --- /dev/null +++ b/compat/thrust/system/detail/internal/scalar/copy.inl @@ -0,0 +1,127 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
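Aside: the sequential lower_bound and upper_bound above are the classic bisection, keep a remaining length, probe the middle, and discard half of the range each step. The sketch below shows the same loop in plain C++, restricted to random-access iterators for brevity (the patch uses thrust::advance/distance to stay generic); lower_bound_sketch is an invented name.

#include <cassert>
#include <iterator>
#include <vector>

// Same halving loop as the scalar lower_bound above: maintain [first, first + len)
// and shrink len until it reaches zero.
template <typename Iterator, typename T>
Iterator lower_bound_sketch(Iterator first, Iterator last, const T &val)
{
    typename std::iterator_traits<Iterator>::difference_type len = last - first;

    while (len > 0)
    {
        typename std::iterator_traits<Iterator>::difference_type half = len >> 1;
        Iterator middle = first + half;

        if (*middle < val)          // comp(*middle, val) in the patch
        {
            first = middle + 1;     // keep the upper half, excluding middle
            len   = len - half - 1;
        }
        else
        {
            len = half;             // keep the lower half, including middle
        }
    }
    return first;
}

int main()
{
    std::vector<int> v;
    for (int i = 0; i < 10; ++i) v.push_back(2 * i);              // 0 2 4 ... 18
    assert(lower_bound_sketch(v.begin(), v.end(), 7) - v.begin() == 4);  // first element >= 7 is 8
    assert(lower_bound_sketch(v.begin(), v.end(), 8) - v.begin() == 4);
    return 0;
}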
+ */ + +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace internal +{ +namespace scalar +{ +namespace copy_detail +{ + + +// returns the raw pointer associated with a Pointer-like thing +template + typename thrust::detail::pointer_traits::raw_pointer + get(Pointer ptr) +{ + return thrust::detail::pointer_traits::get(ptr); +} + + +template + OutputIterator copy(InputIterator first, + InputIterator last, + OutputIterator result, + thrust::detail::true_type) // is_trivial_copy +{ + typedef typename thrust::iterator_difference::type Size; + + const Size n = last - first; + thrust::system::detail::internal::scalar::trivial_copy_n(get(&*first), n, get(&*result)); + return result + n; +} // end copy() + + +template + OutputIterator copy(InputIterator first, + InputIterator last, + OutputIterator result, + thrust::detail::false_type) // is_trivial_copy +{ + return thrust::system::detail::internal::scalar::general_copy(first,last,result); +} // end copy() + + +template + OutputIterator copy_n(InputIterator first, + Size n, + OutputIterator result, + thrust::detail::true_type) // is_trivial_copy +{ + thrust::system::detail::internal::scalar::trivial_copy_n(get(&*first), n, get(&*result)); + return result + n; +} // end copy_n() + + +template + OutputIterator copy_n(InputIterator first, + Size n, + OutputIterator result, + thrust::detail::false_type) // is_trivial_copy +{ + return thrust::system::detail::internal::scalar::general_copy_n(first,n,result); +} // end copy_n() + +} // end namespace copy_detail + + +template + OutputIterator copy(InputIterator first, + InputIterator last, + OutputIterator result) +{ + return thrust::system::detail::internal::scalar::copy_detail::copy(first, last, result, + typename thrust::detail::dispatch::is_trivial_copy::type()); +} // end copy() + + +template + OutputIterator copy_n(InputIterator first, + Size n, + OutputIterator result) +{ + return thrust::system::detail::internal::scalar::copy_detail::copy_n(first, n, result, + typename thrust::detail::dispatch::is_trivial_copy::type()); +} // end copy_n() + +} // end namespace scalar +} // end namespace internal +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/internal/scalar/copy_backward.h b/compat/thrust/system/detail/internal/scalar/copy_backward.h new file mode 100644 index 0000000..36f8f66 --- /dev/null +++ b/compat/thrust/system/detail/internal/scalar/copy_backward.h @@ -0,0 +1,53 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace internal +{ +namespace scalar +{ + +template +BidirectionalIterator2 copy_backward(BidirectionalIterator1 first, + BidirectionalIterator1 last, + BidirectionalIterator2 result) +{ + while (first != last) + { + --last; + --result; + *result = *last; + } + + return result; +} + +} // end namespace scalar +} // end namespace internal +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/internal/scalar/copy_if.h b/compat/thrust/system/detail/internal/scalar/copy_if.h new file mode 100644 index 0000000..67f9402 --- /dev/null +++ b/compat/thrust/system/detail/internal/scalar/copy_if.h @@ -0,0 +1,69 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file copy_if.h + * \brief Sequential implementation of copy_if. + */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace internal +{ +namespace scalar +{ + +template + OutputIterator copy_if(InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator result, + Predicate pred) +{ + thrust::detail::host_function wrapped_pred(pred); + + while(first != last) + { + if(wrapped_pred(*stencil)) + { + *result = *first; + ++result; + } // end if + + ++first; + ++stencil; + } // end while + + return result; +} // end copy_if() + +} // end namespace scalar +} // end namespace internal +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/internal/scalar/extrema.h b/compat/thrust/system/detail/internal/scalar/extrema.h new file mode 100644 index 0000000..ebea756 --- /dev/null +++ b/compat/thrust/system/detail/internal/scalar/extrema.h @@ -0,0 +1,127 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file extrema.h + * \brief Sequential implementations of extrema functions. 
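Aside: copy_backward above walks from the end so that a copy into an overlapping destination lying to the right of the source does not overwrite elements before they are read (the insertion sort further down relies on exactly this). A small standalone demonstration in plain C++; copy_backward_sketch is an illustrative name, not part of the patch.

#include <cassert>
#include <vector>

// Copy [first, last) into the range ending at result, walking backwards,
// mirroring the scalar copy_backward above.
template <typename Iter>
Iter copy_backward_sketch(Iter first, Iter last, Iter result)
{
    while (first != last)
    {
        --last;
        --result;
        *result = *last;     // last element is written first, so the source is still intact
    }
    return result;
}

int main()
{
    // Shift elements 0..3 one slot to the right inside the same vector.
    std::vector<int> v;
    for (int i = 1; i <= 5; ++i) v.push_back(i);                  // 1 2 3 4 5
    copy_backward_sketch(v.begin(), v.begin() + 4, v.end());
    assert(v[1] == 1 && v[2] == 2 && v[3] == 3 && v[4] == 4);     // 1 1 2 3 4
    return 0;
}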
+ */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace internal +{ +namespace scalar +{ + +template +ForwardIterator min_element(ForwardIterator first, + ForwardIterator last, + BinaryPredicate comp) +{ + // wrap comp + thrust::detail::host_function< + BinaryPredicate, + bool + > wrapped_comp(comp); + + ForwardIterator imin = first; + + for (; first != last; first++) + { + if (wrapped_comp(*first, *imin)) + { + imin = first; + } + } + + return imin; +} + + +template +ForwardIterator max_element(ForwardIterator first, + ForwardIterator last, + BinaryPredicate comp) +{ + // wrap comp + thrust::detail::host_function< + BinaryPredicate, + bool + > wrapped_comp(comp); + + ForwardIterator imax = first; + + for (; first != last; first++) + { + if (wrapped_comp(*imax, *first)) + { + imax = first; + } + } + + return imax; +} + + +template +thrust::pair minmax_element(ForwardIterator first, + ForwardIterator last, + BinaryPredicate comp) +{ + // wrap comp + thrust::detail::host_function< + BinaryPredicate, + bool + > wrapped_comp(comp); + + ForwardIterator imin = first; + ForwardIterator imax = first; + + for (; first != last; first++) + { + if (wrapped_comp(*first, *imin)) + { + imin = first; + } + + if (wrapped_comp(*imax, *first)) + { + imax = first; + } + } + + return thrust::make_pair(imin, imax); +} + +} // end namespace scalar +} // end namespace internal +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/internal/scalar/find.h b/compat/thrust/system/detail/internal/scalar/find.h new file mode 100644 index 0000000..6b25021 --- /dev/null +++ b/compat/thrust/system/detail/internal/scalar/find.h @@ -0,0 +1,67 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file find.h + * \brief Sequential implementation of find_if. 
+ */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace internal +{ +namespace scalar +{ + +template +InputIterator find_if(InputIterator first, + InputIterator last, + Predicate pred) +{ + // wrap pred + thrust::detail::host_function< + Predicate, + bool + > wrapped_pred(pred); + + while(first != last) + { + if (wrapped_pred(*first)) + return first; + + ++first; + } + + // return first so zip_iterator works correctly + return first; +} + +} // end namespace scalar +} // end namespace internal +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/internal/scalar/for_each.h b/compat/thrust/system/detail/internal/scalar/for_each.h new file mode 100644 index 0000000..4e31d91 --- /dev/null +++ b/compat/thrust/system/detail/internal/scalar/for_each.h @@ -0,0 +1,87 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file for_each.h + * \brief Sequential implementations of for_each functions. + */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace internal +{ +namespace scalar +{ + +template +InputIterator for_each(InputIterator first, + InputIterator last, + UnaryFunction f) +{ + // wrap f + thrust::detail::host_function< + UnaryFunction, + void + > wrapped_f(f); + + for(; first != last; ++first) + { + wrapped_f(*first); + } + + return first; +} // end for_each() + +template +InputIterator for_each_n(InputIterator first, + Size n, + UnaryFunction f) +{ + // wrap f + thrust::detail::host_function< + UnaryFunction, + void + > wrapped_f(f); + + for(Size i = 0; i != n; i++) + { + // we can dereference an OutputIterator if f does not + // try to use the reference for anything besides assignment + wrapped_f(*first); + ++first; + } + + return first; +} // end for_each_n() + +} // end namespace scalar +} // end namespace internal +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/internal/scalar/general_copy.h b/compat/thrust/system/detail/internal/scalar/general_copy.h new file mode 100644 index 0000000..aae061d --- /dev/null +++ b/compat/thrust/system/detail/internal/scalar/general_copy.h @@ -0,0 +1,65 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! 
\file general_copy.h + * \brief Sequential copy algorithms for general iterators. + */ + +#pragma once + +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace internal +{ +namespace scalar +{ + +template + OutputIterator general_copy(InputIterator first, + InputIterator last, + OutputIterator result) +{ + for(; first != last; ++first, ++result) + *result = *first; + return result; +} // end general_copy() + + +template + OutputIterator general_copy_n(InputIterator first, + Size n, + OutputIterator result) +{ + for(; n > Size(0); ++first, ++result, --n) + *result = *first; + return result; +} // end general_copy_n() + +} // end namespace scalar +} // end namespace internal +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/internal/scalar/insertion_sort.h b/compat/thrust/system/detail/internal/scalar/insertion_sort.h new file mode 100644 index 0000000..5949ce7 --- /dev/null +++ b/compat/thrust/system/detail/internal/scalar/insertion_sort.h @@ -0,0 +1,149 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace internal +{ +namespace scalar +{ + +template +void insertion_sort(RandomAccessIterator first, + RandomAccessIterator last, + StrictWeakOrdering comp) +{ + typedef typename thrust::iterator_value::type value_type; + + if (first == last) return; + + // wrap comp + thrust::detail::host_function< + StrictWeakOrdering, + bool + > wrapped_comp(comp); + + for(RandomAccessIterator i = first + 1; i != last; ++i) + { + value_type tmp = *i; + + if (wrapped_comp(tmp, *first)) + { + // tmp is the smallest value encountered so far + thrust::system::detail::internal::scalar::copy_backward(first, i, i + 1); + + *first = tmp; + } + else + { + // tmp is not the smallest value, can avoid checking for j == first + RandomAccessIterator j = i; + RandomAccessIterator k = i - 1; + + while(wrapped_comp(tmp, *k)) + { + *j = *k; + j = k; + --k; + } + + *j = tmp; + } + } +} + +template +void insertion_sort_by_key(RandomAccessIterator1 first1, + RandomAccessIterator1 last1, + RandomAccessIterator2 first2, + StrictWeakOrdering comp) +{ + typedef typename thrust::iterator_value::type value_type1; + typedef typename thrust::iterator_value::type value_type2; + + if (first1 == last1) return; + + // wrap comp + thrust::detail::host_function< + StrictWeakOrdering, + bool + > wrapped_comp(comp); + + RandomAccessIterator1 i1 = first1 + 1; + RandomAccessIterator2 i2 = first2 + 1; + + for(; i1 != last1; ++i1, ++i2) + { + value_type1 tmp1 = *i1; + value_type2 tmp2 = *i2; + + if (wrapped_comp(tmp1, *first1)) + { + // tmp is the smallest value encountered so far + thrust::system::detail::internal::scalar::copy_backward(first1, i1, i1 + 1); + thrust::system::detail::internal::scalar::copy_backward(first2, i2, i2 + 1); + + *first1 = tmp1; 
+ *first2 = tmp2; + } + else + { + // tmp is not the smallest value, can avoid checking for j == first + RandomAccessIterator1 j1 = i1; + RandomAccessIterator1 k1 = i1 - 1; + + RandomAccessIterator2 j2 = i2; + RandomAccessIterator2 k2 = i2 - 1; + + while(wrapped_comp(tmp1, *k1)) + { + *j1 = *k1; + *j2 = *k2; + + j1 = k1; + j2 = k2; + + --k1; + --k2; + } + + *j1 = tmp1; + *j2 = tmp2; + } + } +} + +} // end namespace scalar +} // end namespace internal +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/internal/scalar/merge.h b/compat/thrust/system/detail/internal/scalar/merge.h new file mode 100644 index 0000000..c02fca4 --- /dev/null +++ b/compat/thrust/system/detail/internal/scalar/merge.h @@ -0,0 +1,73 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file merge.h + * \brief Sequential implementation of merge algorithms. + */ + +#pragma once + +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace internal +{ +namespace scalar +{ + +template +OutputIterator merge(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + StrictWeakOrdering comp); + +template +thrust::pair + merge_by_key(InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 keys_last2, + InputIterator3 values_first1, + InputIterator4 values_first2, + OutputIterator1 keys_result, + OutputIterator2 values_result, + StrictWeakOrdering comp); + +} // end namespace scalar +} // end namespace internal +} // end namespace detail +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/detail/internal/scalar/merge.inl b/compat/thrust/system/detail/internal/scalar/merge.inl new file mode 100644 index 0000000..a7c2a39 --- /dev/null +++ b/compat/thrust/system/detail/internal/scalar/merge.inl @@ -0,0 +1,145 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
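Aside: insertion_sort_by_key above permutes the value range in lockstep with the keys, and the copy_backward calls are the bulk shift used on the "new minimum" fast path. The condensed index-based sketch below shows the same by-key insertion sort in plain C++ without that fast path; the function name and test data are illustrative only.

#include <cassert>
#include <string>
#include <vector>

// Insertion sort on keys, applying every move to vals as well so the
// key/value pairing is preserved (same contract as insertion_sort_by_key).
void insertion_sort_by_key_sketch(std::vector<int> &keys, std::vector<std::string> &vals)
{
    for (std::size_t i = 1; i < keys.size(); ++i)
    {
        int k         = keys[i];
        std::string v = vals[i];

        std::size_t j = i;
        while (j > 0 && k < keys[j - 1])   // shift larger keys (and their values) right
        {
            keys[j] = keys[j - 1];
            vals[j] = vals[j - 1];
            --j;
        }
        keys[j] = k;
        vals[j] = v;
    }
}

int main()
{
    std::vector<int> keys;        std::vector<std::string> vals;
    keys.push_back(3); vals.push_back("c");
    keys.push_back(1); vals.push_back("a");
    keys.push_back(2); vals.push_back("b");
    insertion_sort_by_key_sketch(keys, vals);
    assert(keys[0] == 1 && vals[0] == "a" && keys[2] == 3 && vals[2] == "c");
    return 0;
}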
+ */ + +#include + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace internal +{ +namespace scalar +{ + +template +OutputIterator merge(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + StrictWeakOrdering comp) +{ + // wrap comp + thrust::detail::host_function< + StrictWeakOrdering, + bool + > wrapped_comp(comp); + + while(first1 != last1 && first2 != last2) + { + if(wrapped_comp(*first2, *first1)) + { + *result = *first2; + ++first2; + } // end if + else + { + *result = *first1; + ++first1; + } // end else + + ++result; + } // end while + + return thrust::system::detail::internal::scalar::copy(first2, last2, thrust::system::detail::internal::scalar::copy(first1, last1, result)); +} // end merge() + + +template +thrust::pair + merge_by_key(InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 keys_last2, + InputIterator3 values_first1, + InputIterator4 values_first2, + OutputIterator1 keys_result, + OutputIterator2 values_result, + StrictWeakOrdering comp) +{ + // wrap comp + thrust::detail::host_function< + StrictWeakOrdering, + bool + > wrapped_comp(comp); + + while(keys_first1 != keys_last1 && keys_first2 != keys_last2) + { + if(!wrapped_comp(*keys_first2, *keys_first1)) + { + // *keys_first1 <= *keys_first2 + *keys_result = *keys_first1; + *values_result = *values_first1; + ++keys_first1; + ++values_first1; + } + else + { + // *keys_first1 > keys_first2 + *keys_result = *keys_first2; + *values_result = *values_first2; + ++keys_first2; + ++values_first2; + } + + ++keys_result; + ++values_result; + } + + while(keys_first1 != keys_last1) + { + *keys_result = *keys_first1; + *values_result = *values_first1; + ++keys_first1; + ++values_first1; + ++keys_result; + ++values_result; + } + + while(keys_first2 != keys_last2) + { + *keys_result = *keys_first2; + *values_result = *values_first2; + ++keys_first2; + ++values_first2; + ++keys_result; + ++values_result; + } + + return thrust::make_pair(keys_result, values_result); +} + +} // end namespace scalar +} // end namespace internal +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/internal/scalar/partition.h b/compat/thrust/system/detail/internal/scalar/partition.h new file mode 100644 index 0000000..7ba677e --- /dev/null +++ b/compat/thrust/system/detail/internal/scalar/partition.h @@ -0,0 +1,262 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file partition.h + * \brief Sequential implementations of partition functions. 
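Aside: note the asymmetry in the merge above, an element is taken from the second range only when it is strictly less than the current element of the first, so ties resolve in favour of the first range and the merge is stable. The standalone check below reproduces that tie-breaking rule in plain C++; merge_sketch is an invented name.

#include <cassert>
#include <vector>

// Two-way merge with the same tie-breaking rule as the scalar merge above:
// take from b only when *b < *a, so equal keys keep their a-before-b order.
std::vector<int> merge_sketch(const std::vector<int> &a, const std::vector<int> &b)
{
    std::vector<int> out;
    std::size_t i = 0, j = 0;
    while (i < a.size() && j < b.size())
    {
        if (b[j] < a[i]) out.push_back(b[j++]);
        else             out.push_back(a[i++]);
    }
    while (i < a.size()) out.push_back(a[i++]);   // drain whichever range is left
    while (j < b.size()) out.push_back(b[j++]);
    return out;
}

int main()
{
    std::vector<int> a; a.push_back(1); a.push_back(3); a.push_back(5);
    std::vector<int> b; b.push_back(3); b.push_back(4);
    std::vector<int> m = merge_sketch(a, b);
    // Result: 1 3 3 4 5, and the 3 taken from a precedes the 3 taken from b.
    assert(m.size() == 5 && m[1] == 3 && m[2] == 3 && m[3] == 4);
    return 0;
}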
+ */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace internal +{ +namespace scalar +{ + +template +void iter_swap(ForwardIterator1 iter1, ForwardIterator2 iter2) +{ + // XXX this isn't correct because it doesn't use thrust::swap + using namespace thrust::detail; + + typedef typename thrust::iterator_value::type T; + + T temp = *iter1; + *iter1 = *iter2; + *iter2 = temp; +} + +template + ForwardIterator partition(ForwardIterator first, + ForwardIterator last, + Predicate pred) +{ + if (first == last) + return first; + + // wrap pred + thrust::detail::host_function< + Predicate, + bool + > wrapped_pred(pred); + + while (wrapped_pred(*first)) + { + if (++first == last) + return first; + } + + ForwardIterator next = first; + + while (++next != last) + { + if (wrapped_pred(*next)) + { + iter_swap(first, next); + ++first; + } + } + + return first; +} + +template + ForwardIterator stable_partition(ForwardIterator first, + ForwardIterator last, + Predicate pred) +{ + // wrap pred + thrust::detail::host_function< + Predicate, + bool + > wrapped_pred(pred); + + // XXX the type of exec should be: + // typedef decltype(select_system(first, last)) system; + typedef typename thrust::iterator_system::type ExecutionPolicy; + typedef typename thrust::iterator_value::type T; + + typedef thrust::detail::temporary_array TempRange; + typedef typename TempRange::iterator TempIterator; + + // XXX presumes ExecutionPolicy is default constructible + ExecutionPolicy exec; + TempRange temp(exec, first, last); + + for(TempIterator iter = temp.begin(); iter != temp.end(); ++iter) + { + if (wrapped_pred(*iter)) + { + *first = *iter; + ++first; + } + } + + ForwardIterator middle = first; + + for(TempIterator iter = temp.begin(); iter != temp.end(); ++iter) + { + if (!wrapped_pred(*iter)) + { + *first = *iter; + ++first; + } + } + + return middle; +} + +template + ForwardIterator stable_partition(ForwardIterator first, + ForwardIterator last, + InputIterator stencil, + Predicate pred) +{ + // wrap pred + thrust::detail::host_function< + Predicate, + bool + > wrapped_pred(pred); + + // XXX the type of exec should be: + // typedef decltype(select_system(first, stencil)) system; + typedef typename thrust::iterator_system::type ExecutionPolicy; + typedef typename thrust::iterator_value::type T; + + typedef thrust::detail::temporary_array TempRange; + typedef typename TempRange::iterator TempIterator; + + // XXX presumes ExecutionPolicy is default constructible + ExecutionPolicy exec; + TempRange temp(exec, first, last); + + InputIterator stencil_iter = stencil; + for(TempIterator iter = temp.begin(); iter != temp.end(); ++iter, ++stencil_iter) + { + if (wrapped_pred(*stencil_iter)) + { + *first = *iter; + ++first; + } + } + + ForwardIterator middle = first; + stencil_iter = stencil; + + for(TempIterator iter = temp.begin(); iter != temp.end(); ++iter, ++stencil_iter) + { + if (!wrapped_pred(*stencil_iter)) + { + *first = *iter; + ++first; + } + } + + return middle; +} + +template + thrust::pair + stable_partition_copy(InputIterator first, + InputIterator last, + OutputIterator1 out_true, + OutputIterator2 out_false, + Predicate pred) +{ + // wrap pred + thrust::detail::host_function< + Predicate, + bool + > wrapped_pred(pred); + + for(; first != last; ++first) + { + if(wrapped_pred(*first)) + { + *out_true = *first; + ++out_true; + } // end if + else + { + *out_false = *first; + ++out_false; + } // end else + } + + return 
thrust::make_pair(out_true, out_false); +} + +template + thrust::pair + stable_partition_copy(InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator1 out_true, + OutputIterator2 out_false, + Predicate pred) +{ + // wrap pred + thrust::detail::host_function< + Predicate, + bool + > wrapped_pred(pred); + + for(; first != last; ++first, ++stencil) + { + if(wrapped_pred(*stencil)) + { + *out_true = *first; + ++out_true; + } // end if + else + { + *out_false = *first; + ++out_false; + } // end else + } + + return thrust::make_pair(out_true, out_false); +} + +} // end namespace scalar +} // end namespace internal +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/internal/scalar/reduce.h b/compat/thrust/system/detail/internal/scalar/reduce.h new file mode 100644 index 0000000..7ad430e --- /dev/null +++ b/compat/thrust/system/detail/internal/scalar/reduce.h @@ -0,0 +1,69 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file reduce.h + * \brief Sequential implementation of reduce algorithm. + */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace internal +{ +namespace scalar +{ + +template + OutputType reduce(InputIterator begin, + InputIterator end, + OutputType init, + BinaryFunction binary_op) +{ + // wrap binary_op + thrust::detail::host_function< + BinaryFunction, + OutputType + > wrapped_binary_op(binary_op); + + // initialize the result + OutputType result = init; + + while(begin != end) + { + result = wrapped_binary_op(result, *begin); + ++begin; + } // end while + + return result; +} + +} // end namespace scalar +} // end namespace internal +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/internal/scalar/reduce_by_key.h b/compat/thrust/system/detail/internal/scalar/reduce_by_key.h new file mode 100644 index 0000000..eeacb9d --- /dev/null +++ b/compat/thrust/system/detail/internal/scalar/reduce_by_key.h @@ -0,0 +1,103 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace internal +{ +namespace scalar +{ + +template + thrust::pair + reduce_by_key(InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + OutputIterator1 keys_output, + OutputIterator2 values_output, + BinaryPredicate binary_pred, + BinaryFunction binary_op) +{ + typedef typename thrust::iterator_traits::value_type InputKeyType; + typedef typename thrust::iterator_traits::value_type InputValueType; + + typedef typename thrust::detail::intermediate_type_from_function_and_iterators< + InputIterator2, + OutputIterator2, + BinaryFunction + >::type TemporaryType; + + if(keys_first != keys_last) + { + InputKeyType temp_key = *keys_first; + TemporaryType temp_value = *values_first; + + for(++keys_first, ++values_first; + keys_first != keys_last; + ++keys_first, ++values_first) + { + InputKeyType key = *keys_first; + InputValueType value = *values_first; + + if (binary_pred(temp_key, key)) + { + temp_value = binary_op(temp_value, value); + } + else + { + *keys_output = temp_key; + *values_output = temp_value; + + ++keys_output; + ++values_output; + + temp_key = key; + temp_value = value; + } + } + + *keys_output = temp_key; + *values_output = temp_value; + + ++keys_output; + ++values_output; + } + + return thrust::make_pair(keys_output, values_output); +} + +} // end namespace scalar +} // end namespace internal +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/internal/scalar/remove.h b/compat/thrust/system/detail/internal/scalar/remove.h new file mode 100644 index 0000000..2360019 --- /dev/null +++ b/compat/thrust/system/detail/internal/scalar/remove.h @@ -0,0 +1,185 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file remove.h + * \brief Sequential implementations of remove functions. 
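Aside: reduce_by_key above folds runs of consecutive equal keys into a single key/value pair by carrying a running (key, accumulator) pair and flushing it whenever the key changes. The sketch below shows the same flush-on-key-change loop in plain C++ with addition as the reduction; reduce_by_key_sketch and the sample data are illustrative, not from the patch.

#include <cassert>
#include <utility>
#include <vector>

// Collapse runs of equal consecutive keys, summing their values,
// mirroring the reduce_by_key loop above.
std::vector<std::pair<int, int> >
reduce_by_key_sketch(const std::vector<int> &keys, const std::vector<int> &vals)
{
    std::vector<std::pair<int, int> > out;
    if (keys.empty()) return out;

    int run_key = keys[0];
    int run_sum = vals[0];

    for (std::size_t i = 1; i < keys.size(); ++i)
    {
        if (keys[i] == run_key)
        {
            run_sum += vals[i];                               // extend the current run
        }
        else
        {
            out.push_back(std::make_pair(run_key, run_sum));  // flush the finished run
            run_key = keys[i];
            run_sum = vals[i];
        }
    }
    out.push_back(std::make_pair(run_key, run_sum));          // flush the last run
    return out;
}

int main()
{
    int k[] = { 1, 1, 2, 2, 2, 1 };
    int v[] = { 1, 2, 3, 4, 5, 6 };
    std::vector<int> keys(k, k + 6), vals(v, v + 6);
    std::vector<std::pair<int, int> > r = reduce_by_key_sketch(keys, vals);
    // Runs: (1: 1+2), (2: 3+4+5), (1: 6); only *consecutive* equal keys are merged.
    assert(r.size() == 3 && r[0].second == 3 && r[1].second == 12 && r[2].second == 6);
    return 0;
}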
+ */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace internal +{ +namespace scalar +{ + +template + ForwardIterator remove_if(ForwardIterator first, + ForwardIterator last, + Predicate pred) +{ + // wrap pred + thrust::detail::host_function< + Predicate, + bool + > wrapped_pred(pred); + + // advance iterators until wrapped_pred(*first) is true or we reach the end of input + while(first != last && !wrapped_pred(*first)) + ++first; + + if(first == last) + return first; + + // result always trails first + ForwardIterator result = first; + + ++first; + + while(first != last) + { + if(!wrapped_pred(*first)) + { + *result = *first; + ++result; + } + ++first; + } + + return result; +} + + +template + ForwardIterator remove_if(ForwardIterator first, + ForwardIterator last, + InputIterator stencil, + Predicate pred) +{ + // wrap pred + thrust::detail::host_function< + Predicate, + bool + > wrapped_pred(pred); + + // advance iterators until wrapped_pred(*stencil) is true or we reach the end of input + while(first != last && !wrapped_pred(*stencil)) + { + ++first; + ++stencil; + } + + if(first == last) + return first; + + // result always trails first + ForwardIterator result = first; + + ++first; + ++stencil; + + while(first != last) + { + if(!wrapped_pred(*stencil)) + { + *result = *first; + ++result; + } + ++first; + ++stencil; + } + + return result; +} + + +template + OutputIterator remove_copy_if(InputIterator first, + InputIterator last, + OutputIterator result, + Predicate pred) +{ + // wrap pred + thrust::detail::host_function< + Predicate, + bool + > wrapped_pred(pred); + + while (first != last) + { + if (!wrapped_pred(*first)) + { + *result = *first; + ++result; + } + + ++first; + } + + return result; +} + +template + OutputIterator remove_copy_if(InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator result, + Predicate pred) +{ + // wrap pred + thrust::detail::host_function< + Predicate, + bool + > wrapped_pred(pred); + + while (first != last) + { + if (!wrapped_pred(*stencil)) + { + *result = *first; + ++result; + } + + ++first; + ++stencil; + } + + return result; +} + +} // end namespace scalar +} // end namespace internal +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/internal/scalar/scan.h b/compat/thrust/system/detail/internal/scalar/scan.h new file mode 100644 index 0000000..8f41150 --- /dev/null +++ b/compat/thrust/system/detail/internal/scalar/scan.h @@ -0,0 +1,153 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file scan.h + * \brief Sequential implementations of scan functions. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace internal +{ +namespace scalar +{ + +template + OutputIterator inclusive_scan(InputIterator first, + InputIterator last, + OutputIterator result, + BinaryFunction binary_op) +{ + // the pseudocode for deducing the type of the temporary used below: + // + // if BinaryFunction is AdaptableBinaryFunction + // TemporaryType = AdaptableBinaryFunction::result_type + // else if OutputIterator is a "pure" output iterator + // TemporaryType = InputIterator::value_type + // else + // TemporaryType = OutputIterator::value_type + // + // XXX upon c++0x, TemporaryType needs to be: + // result_of::type + + using namespace thrust::detail; + + typedef typename eval_if< + has_result_type::value, + result_type, + eval_if< + is_output_iterator::value, + thrust::iterator_value, + thrust::iterator_value + > + >::type ValueType; + + // wrap binary_op + thrust::detail::host_function< + BinaryFunction, + ValueType + > wrapped_binary_op(binary_op); + + if(first != last) + { + ValueType sum = *first; + + *result = sum; + + for(++first, ++result; first != last; ++first, ++result) + *result = sum = wrapped_binary_op(sum,*first); + } + + return result; +} + + +template + OutputIterator exclusive_scan(InputIterator first, + InputIterator last, + OutputIterator result, + T init, + BinaryFunction binary_op) +{ + // the pseudocode for deducing the type of the temporary used below: + // + // if BinaryFunction is AdaptableBinaryFunction + // TemporaryType = AdaptableBinaryFunction::result_type + // else if OutputIterator is a "pure" output iterator + // TemporaryType = InputIterator::value_type + // else + // TemporaryType = OutputIterator::value_type + // + // XXX upon c++0x, TemporaryType needs to be: + // result_of::type + + using namespace thrust::detail; + + typedef typename eval_if< + has_result_type::value, + result_type, + eval_if< + is_output_iterator::value, + thrust::iterator_value, + thrust::iterator_value + > + >::type ValueType; + + if(first != last) + { + ValueType tmp = *first; // temporary value allows in-situ scan + ValueType sum = init; + + *result = sum; + sum = binary_op(sum, tmp); + + for(++first, ++result; first != last; ++first, ++result) + { + tmp = *first; + *result = sum; + sum = binary_op(sum, tmp); + } + } + + return result; +} + +} // end namespace scalar +} // end namespace internal +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/internal/scalar/scan_by_key.h b/compat/thrust/system/detail/internal/scalar/scan_by_key.h new file mode 100644 index 0000000..a31fc60 --- /dev/null +++ b/compat/thrust/system/detail/internal/scalar/scan_by_key.h @@ -0,0 +1,147 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file scan_by_key.h + * \brief Sequential implementation of scan_by_key functions. 
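Aside: exclusive_scan above stages *first into a temporary before writing *result precisely so that the scan may run in place, with the output aliasing the input. The small standalone version below keeps that read-before-write order, using addition as the operator; exclusive_scan_sketch is an illustrative name.

#include <cassert>
#include <vector>

// Exclusive prefix sum that is safe to run in place: read the input element
// into tmp before overwriting the same slot, as the scalar exclusive_scan does.
void exclusive_scan_sketch(std::vector<int> &data, int init)
{
    if (data.empty()) return;

    int sum = init;
    for (std::size_t i = 0; i < data.size(); ++i)
    {
        int tmp = data[i];   // read before the in-place write
        data[i] = sum;
        sum += tmp;
    }
}

int main()
{
    int a[] = { 3, 1, 4, 1, 5 };
    std::vector<int> v(a, a + 5);
    exclusive_scan_sketch(v, 0);
    // 0, 3, 3+1, 3+1+4, 3+1+4+1
    assert(v[0] == 0 && v[1] == 3 && v[2] == 4 && v[3] == 8 && v[4] == 9);
    return 0;
}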
+ */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace internal +{ +namespace scalar +{ + +template + OutputIterator inclusive_scan_by_key(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + OutputIterator result, + BinaryPredicate binary_pred, + BinaryFunction binary_op) +{ + using namespace thrust::detail; + + typedef typename thrust::iterator_traits::value_type KeyType; + typedef typename thrust::iterator_traits::value_type ValueType; + + // wrap binary_op + thrust::detail::host_function< + BinaryFunction, + ValueType + > wrapped_binary_op(binary_op); + + if(first1 != last1) + { + KeyType prev_key = *first1; + ValueType prev_value = *first2; + + *result = prev_value; + + for(++first1, ++first2, ++result; + first1 != last1; + ++first1, ++first2, ++result) + { + KeyType key = *first1; + + if (binary_pred(prev_key, key)) + *result = prev_value = wrapped_binary_op(prev_value,*first2); + else + *result = prev_value = *first2; + + prev_key = key; + } + } + + return result; +} + + +template + OutputIterator exclusive_scan_by_key(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + OutputIterator result, + T init, + BinaryPredicate binary_pred, + BinaryFunction binary_op) +{ + using namespace thrust::detail; + + typedef typename thrust::iterator_traits::value_type KeyType; + typedef typename thrust::iterator_traits::value_type ValueType; + + if(first1 != last1) + { + KeyType temp_key = *first1; + ValueType temp_value = *first2; + + ValueType next = init; + + // first one is init + *result = next; + + next = binary_op(next, temp_value); + + for(++first1, ++first2, ++result; + first1 != last1; + ++first1, ++first2, ++result) + { + KeyType key = *first1; + + // use temp to permit in-place scans + temp_value = *first2; + + if (!binary_pred(temp_key, key)) + next = init; // reset sum + + *result = next; + next = binary_op(next, temp_value); + + temp_key = key; + } + } + + return result; +} + +} // end namespace scalar +} // end namespace internal +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/internal/scalar/set_operations.h b/compat/thrust/system/detail/internal/scalar/set_operations.h new file mode 100644 index 0000000..f85b510 --- /dev/null +++ b/compat/thrust/system/detail/internal/scalar/set_operations.h @@ -0,0 +1,208 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file set_operations.h + * \brief Sequential implementation of set operation functions. 
+ */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace internal +{ +namespace scalar +{ + +template + OutputIterator set_difference(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + StrictWeakOrdering comp) +{ + // wrap comp + thrust::detail::host_function< + StrictWeakOrdering, + bool + > wrapped_comp(comp); + + while(first1 != last1 && first2 != last2) + { + if(wrapped_comp(*first1,*first2)) + { + *result = *first1; + ++first1; + ++result; + } // end if + else if(wrapped_comp(*first2,*first1)) + { + ++first2; + } // end else if + else + { + ++first1; + ++first2; + } // end else + } // end while + + return scalar::copy(first1, last1, result); +} // end set_difference() + + +template + OutputIterator set_intersection(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + StrictWeakOrdering comp) +{ + // wrap comp + thrust::detail::host_function< + StrictWeakOrdering, + bool + > wrapped_comp(comp); + + while(first1 != last1 && first2 != last2) + { + if(wrapped_comp(*first1,*first2)) + { + ++first1; + } // end if + else if(wrapped_comp(*first2,*first1)) + { + ++first2; + } // end else if + else + { + *result = *first1; + ++first1; + ++first2; + ++result; + } // end else + } // end while + + return result; +} // end set_intersection() + + +template + OutputIterator set_symmetric_difference(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + StrictWeakOrdering comp) +{ + // wrap comp + thrust::detail::host_function< + StrictWeakOrdering, + bool + > wrapped_comp(comp); + + while(first1 != last1 && first2 != last2) + { + if(wrapped_comp(*first1,*first2)) + { + *result = *first1; + ++first1; + ++result; + } // end if + else if(wrapped_comp(*first2,*first1)) + { + *result = *first2; + ++first2; + ++result; + } // end else if + else + { + ++first1; + ++first2; + } // end else + } // end while + + return scalar::copy(first2, last2, scalar::copy(first1, last1, result)); +} // end set_symmetric_difference() + + +template + OutputIterator set_union(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + StrictWeakOrdering comp) +{ + // wrap comp + thrust::detail::host_function< + StrictWeakOrdering, + bool + > wrapped_comp(comp); + + while(first1 != last1 && first2 != last2) + { + if(wrapped_comp(*first1,*first2)) + { + *result = *first1; + ++first1; + } // end if + else if(wrapped_comp(*first2,*first1)) + { + *result = *first2; + ++first2; + } // end else if + else + { + *result = *first1; + ++first1; + ++first2; + } // end else + + ++result; + } // end while + + return scalar::copy(first2, last2, scalar::copy(first1, last1, result)); +} // end set_union() + +} // end namespace scalar +} // end namespace internal +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/internal/scalar/sort.h b/compat/thrust/system/detail/internal/scalar/sort.h new file mode 100644 index 0000000..9e465c8 --- /dev/null +++ b/compat/thrust/system/detail/internal/scalar/sort.h @@ -0,0 +1,57 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file sort.h + * \brief Sequential implementations of sort algorithms. + */ + +#pragma once + +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace internal +{ +namespace scalar +{ + +template +void stable_sort(RandomAccessIterator first, + RandomAccessIterator last, + StrictWeakOrdering comp); + +template +void stable_sort_by_key(RandomAccessIterator1 first1, + RandomAccessIterator1 last1, + RandomAccessIterator2 first2, + StrictWeakOrdering comp); + +} // end namespace scalar +} // end namespace internal +} // end namespace detail +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/detail/internal/scalar/sort.inl b/compat/thrust/system/detail/internal/scalar/sort.inl new file mode 100644 index 0000000..c6ed273 --- /dev/null +++ b/compat/thrust/system/detail/internal/scalar/sort.inl @@ -0,0 +1,161 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace internal +{ +namespace scalar +{ +namespace sort_detail +{ + +//////////////////// +// Primitive Sort // +//////////////////// + +template +void stable_sort(RandomAccessIterator first, + RandomAccessIterator last, + StrictWeakOrdering comp, + thrust::detail::true_type) +{ + thrust::system::detail::internal::scalar::stable_primitive_sort(first, last); + + // if comp is greater then reverse the keys + typedef typename thrust::iterator_traits::value_type KeyType; + const static bool reverse = thrust::detail::is_same >::value; + + if (reverse) + thrust::reverse(first, last); +} + +template +void stable_sort_by_key(RandomAccessIterator1 first1, + RandomAccessIterator1 last1, + RandomAccessIterator2 first2, + StrictWeakOrdering comp, + thrust::detail::true_type) +{ + // if comp is greater then reverse the keys and values + typedef typename thrust::iterator_traits::value_type KeyType; + const static bool reverse = thrust::detail::is_same >::value; + + // note, we also have to reverse the (unordered) input to preserve stability + if (reverse) + { + thrust::reverse(first1, last1); + thrust::reverse(first2, first2 + (last1 - first1)); + } + + thrust::system::detail::internal::scalar::stable_primitive_sort_by_key(first1, last1, first2); + + if (reverse) + { + thrust::reverse(first1, last1); + thrust::reverse(first2, first2 + (last1 - first1)); + } +} + +//////////////// +// Merge Sort // +//////////////// + +template +void stable_sort(RandomAccessIterator first, + RandomAccessIterator last, + StrictWeakOrdering comp, + thrust::detail::false_type) +{ + thrust::system::detail::internal::scalar::stable_merge_sort(first, last, comp); +} + +template +void stable_sort_by_key(RandomAccessIterator1 first1, + RandomAccessIterator1 last1, + RandomAccessIterator2 first2, + StrictWeakOrdering comp, + thrust::detail::false_type) +{ + thrust::system::detail::internal::scalar::stable_merge_sort_by_key(first1, last1, first2, comp); +} + + +} // end namespace sort_detail + +template +void stable_sort(RandomAccessIterator first, + RandomAccessIterator last, + StrictWeakOrdering comp) +{ + typedef typename thrust::iterator_traits::value_type KeyType; + static const bool use_primitive_sort = thrust::detail::is_arithmetic::value && + (thrust::detail::is_same >::value || + thrust::detail::is_same >::value); + + // supress unused variable warning + (void) use_primitive_sort; + + thrust::system::detail::internal::scalar::sort_detail::stable_sort + (first, last, comp, + thrust::detail::integral_constant()); +} + +template +void stable_sort_by_key(RandomAccessIterator1 first1, + RandomAccessIterator1 last1, + RandomAccessIterator2 first2, + StrictWeakOrdering comp) +{ + typedef typename thrust::iterator_traits::value_type KeyType; + static const bool use_primitive_sort = thrust::detail::is_arithmetic::value && + (thrust::detail::is_same >::value || + thrust::detail::is_same >::value); + + // supress unused variable warning + (void) use_primitive_sort; + + thrust::system::detail::internal::scalar::sort_detail::stable_sort_by_key + (first1, last1, first2, comp, + thrust::detail::integral_constant()); +} + +} // end namespace scalar +} // end namespace internal +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/internal/scalar/stable_merge_sort.h b/compat/thrust/system/detail/internal/scalar/stable_merge_sort.h new file mode 
100644 index 0000000..f68242c --- /dev/null +++ b/compat/thrust/system/detail/internal/scalar/stable_merge_sort.h @@ -0,0 +1,55 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file stable_merge_sort.h + * \brief Sequential implementation of merge sort. + */ + +#pragma once + +#include + +namespace thrust +{ +namespace system +{ +namespace cpp +{ +namespace detail +{ + +template +void stable_merge_sort(RandomAccessIterator begin, + RandomAccessIterator end, + StrictWeakOrdering comp); + +template +void stable_merge_sort_by_key(RandomAccessIterator1 keys_begin, + RandomAccessIterator1 keys_end, + RandomAccessIterator2 values_begin, + StrictWeakOrdering comp); + +} // end namespace detail +} // end namespace cpp +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/detail/internal/scalar/stable_merge_sort.inl b/compat/thrust/system/detail/internal/scalar/stable_merge_sort.inl new file mode 100644 index 0000000..41d320c --- /dev/null +++ b/compat/thrust/system/detail/internal/scalar/stable_merge_sort.inl @@ -0,0 +1,150 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace internal +{ +namespace scalar +{ +namespace detail +{ + +template +void inplace_merge(RandomAccessIterator first, + RandomAccessIterator middle, + RandomAccessIterator last, + StrictWeakOrdering comp) +{ + // XXX the type of exec should be: + // typedef decltype(select_system(first, middle, last)) DerivedPolicy; + typedef typename thrust::iterator_system::type DerivedPolicy; + typedef typename thrust::iterator_value::type value_type; + + // XXX assumes DerivedPolicy is default constructible + // XXX find a way to get a stateful execution policy into this function + // or simply pass scratch space + DerivedPolicy exec; + thrust::detail::temporary_array a(exec, first, middle); + thrust::detail::temporary_array b(exec, middle, last); + + thrust::system::detail::internal::scalar::merge(a.begin(), a.end(), b.begin(), b.end(), first, comp); +} + +template +void inplace_merge_by_key(RandomAccessIterator1 first1, + RandomAccessIterator1 middle1, + RandomAccessIterator1 last1, + RandomAccessIterator2 first2, + StrictWeakOrdering comp) +{ + // XXX the type of exec should be: + // typedef decltype(select_system(first1, middle1, last1, first2)) DerivedPolicy; + typedef typename thrust::iterator_system::type DerivedPolicy; + typedef typename thrust::iterator_value::type value_type1; + typedef typename thrust::iterator_value::type value_type2; + + RandomAccessIterator2 middle2 = first2 + (middle1 - first1); + RandomAccessIterator2 last2 = first2 + (last1 - first1); + + // XXX assumes DerivedPolicy is default constructible + // XXX find a way to get a stateful exec into this function + // or simply pass scratch space + DerivedPolicy exec; + thrust::detail::temporary_array lhs1(exec, first1, middle1); + thrust::detail::temporary_array rhs1(exec, middle1, last1); + thrust::detail::temporary_array lhs2(exec, first2, middle2); + thrust::detail::temporary_array rhs2(exec, middle2, last2); + + thrust::system::detail::internal::scalar::merge_by_key + (lhs1.begin(), lhs1.end(), rhs1.begin(), rhs1.end(), + lhs2.begin(), rhs2.begin(), + first1, first2, comp); +} + +} // end namespace detail + +////////////// +// Key Sort // +////////////// + +template +void stable_merge_sort(RandomAccessIterator first, + RandomAccessIterator last, + StrictWeakOrdering comp) +{ + if (last - first < 32) + { + thrust::system::detail::internal::scalar::insertion_sort(first, last, comp); + } + else + { + RandomAccessIterator middle = first + (last - first) / 2; + + thrust::system::detail::internal::scalar::stable_merge_sort(first, middle, comp); + thrust::system::detail::internal::scalar::stable_merge_sort(middle, last, comp); + detail::inplace_merge(first, middle, last, comp); + } +} + + +//////////////////// +// Key-Value Sort // +//////////////////// + +template +void stable_merge_sort_by_key(RandomAccessIterator1 first1, + RandomAccessIterator1 last1, + RandomAccessIterator2 first2, + StrictWeakOrdering comp) +{ + if (last1 - first1 <= 32) + { + thrust::system::detail::internal::scalar::insertion_sort_by_key(first1, last1, first2, comp); + } + else + { + RandomAccessIterator1 middle1 = first1 + (last1 - first1) / 2; + RandomAccessIterator2 middle2 = first2 + (last1 - first1) / 2; + + thrust::system::detail::internal::scalar::stable_merge_sort_by_key(first1, middle1, first2, comp); + thrust::system::detail::internal::scalar::stable_merge_sort_by_key(middle1, last1, middle2, comp); + 
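+ // both halves are now sorted; merge them back in place, moving each value
+ // together with its key so the key-value sort remains stable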
detail::inplace_merge_by_key(first1, middle1, last1, first2, comp); + } +} + +} // end namespace scalar +} // end namespace internal +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/internal/scalar/stable_primitive_sort.h b/compat/thrust/system/detail/internal/scalar/stable_primitive_sort.h new file mode 100644 index 0000000..f37bf27 --- /dev/null +++ b/compat/thrust/system/detail/internal/scalar/stable_primitive_sort.h @@ -0,0 +1,49 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace internal +{ +namespace scalar +{ + +template +void stable_primitive_sort(RandomAccessIterator first, + RandomAccessIterator last); + +template +void stable_primitive_sort_by_key(RandomAccessIterator1 keys_first, + RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first); + +} // end namespace scalar +} // end namespace internal +} // end namespace detail +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/detail/internal/scalar/stable_primitive_sort.inl b/compat/thrust/system/detail/internal/scalar/stable_primitive_sort.inl new file mode 100644 index 0000000..c22b15c --- /dev/null +++ b/compat/thrust/system/detail/internal/scalar/stable_primitive_sort.inl @@ -0,0 +1,142 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace internal +{ +namespace scalar +{ +namespace stable_primitive_sort_detail +{ + + +template + struct enable_if_bool_sort + : thrust::detail::enable_if< + thrust::detail::is_same< + bool, + typename thrust::iterator_value::type + >::value + > +{}; + + +template + struct disable_if_bool_sort + : thrust::detail::disable_if< + thrust::detail::is_same< + bool, + typename thrust::iterator_value::type + >::value + > +{}; + + + +template + typename enable_if_bool_sort::type + stable_primitive_sort(RandomAccessIterator first, RandomAccessIterator last) +{ + // use stable_partition if we're sorting bool + // stable_partition puts true values first, so we need to logical_not + scalar::stable_partition(first, last, thrust::logical_not()); +} + + +template + typename disable_if_bool_sort::type + stable_primitive_sort(RandomAccessIterator first, RandomAccessIterator last) +{ + // call stable_radix_sort + scalar::stable_radix_sort(first,last); +} + + +struct logical_not_first +{ + template + __host__ __device__ + bool operator()(Tuple t) + { + return !thrust::get<0>(t); + } +}; + + +template + typename enable_if_bool_sort::type + stable_primitive_sort_by_key(RandomAccessIterator1 keys_first, RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first) +{ + // use stable_partition if we're sorting bool + // stable_partition puts true values first, so we need to logical_not + scalar::stable_partition(thrust::make_zip_iterator(thrust::make_tuple(keys_first, values_first)), + thrust::make_zip_iterator(thrust::make_tuple(keys_last, values_first)), + logical_not_first()); +} + + +template + typename disable_if_bool_sort::type + stable_primitive_sort_by_key(RandomAccessIterator1 keys_first, RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first) +{ + // call stable_radix_sort_by_key + scalar::stable_radix_sort_by_key(keys_first, keys_last, values_first); +} + + +} + +template +void stable_primitive_sort(RandomAccessIterator first, + RandomAccessIterator last) +{ + scalar::stable_primitive_sort_detail::stable_primitive_sort(first,last); +} + +template +void stable_primitive_sort_by_key(RandomAccessIterator1 keys_first, + RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first) +{ + scalar::stable_primitive_sort_detail::stable_primitive_sort_by_key(keys_first, keys_last, values_first); +} + +} // end namespace scalar +} // end namespace internal +} // end namespace detail +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/detail/internal/scalar/stable_radix_sort.h b/compat/thrust/system/detail/internal/scalar/stable_radix_sort.h new file mode 100644 index 0000000..f2af222 --- /dev/null +++ b/compat/thrust/system/detail/internal/scalar/stable_radix_sort.h @@ -0,0 +1,54 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file stable_radix_sort.h + * \brief Sequential implementation of radix sort. + */ + +#pragma once + +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace internal +{ +namespace scalar +{ + +template +void stable_radix_sort(RandomAccessIterator begin, + RandomAccessIterator end); + +template +void stable_radix_sort_by_key(RandomAccessIterator1 keys_begin, + RandomAccessIterator1 keys_end, + RandomAccessIterator2 values_begin); + +} // end namespace scalar +} // end namespace internal +} // end namespace detail +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/detail/internal/scalar/stable_radix_sort.inl b/compat/thrust/system/detail/internal/scalar/stable_radix_sort.inl new file mode 100644 index 0000000..98846ab --- /dev/null +++ b/compat/thrust/system/detail/internal/scalar/stable_radix_sort.inl @@ -0,0 +1,434 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#include + +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace internal +{ +namespace scalar +{ +namespace detail +{ + +template +struct RadixEncoder : public thrust::identity +{}; + +template <> +struct RadixEncoder : public thrust::unary_function +{ + unsigned char operator()(char x) const + { + if(std::numeric_limits::is_signed) + return x ^ static_cast(1) << (8 * sizeof(unsigned char) - 1); + else + return x; + } +}; + +template <> +struct RadixEncoder : public thrust::unary_function +{ + unsigned char operator()(signed char x) const + { + return x ^ static_cast(1) << (8 * sizeof(unsigned char) - 1); + } +}; + +template <> +struct RadixEncoder : public thrust::unary_function +{ + unsigned short operator()(short x) const + { + return x ^ static_cast(1) << (8 * sizeof(unsigned short) - 1); + } +}; + +template <> +struct RadixEncoder : public thrust::unary_function +{ + unsigned long operator()(long x) const + { + return x ^ static_cast(1) << (8 * sizeof(unsigned int) - 1); + } +}; + +template <> +struct RadixEncoder : public thrust::unary_function +{ + unsigned long operator()(long x) const + { + return x ^ static_cast(1) << (8 * sizeof(unsigned long) - 1); + } +}; + +template <> +struct RadixEncoder : public thrust::unary_function +{ + unsigned long long operator()(long long x) const + { + return x ^ static_cast(1) << (8 * sizeof(unsigned long long) - 1); + } +}; + +// ideally we'd use uint32 here and uint64 below +template <> +struct RadixEncoder : public thrust::unary_function +{ + thrust::detail::uint32_t operator()(float x) const + { + union { float f; thrust::detail::uint32_t i; } u; + u.f = x; + thrust::detail::uint32_t mask = -static_cast(u.i >> 31) | (static_cast(1) << 31); + return u.i ^ mask; + } +}; + +template <> +struct RadixEncoder : public thrust::unary_function +{ + 
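+ // bit trick for IEEE-754 doubles: reinterpret the value as a 64-bit integer,
+ // then flip every bit of a negative number but only the sign bit of a
+ // non-negative one, so that unsigned integer order matches floating-point order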
thrust::detail::uint64_t operator()(double x) const + { + union { double f; thrust::detail::uint64_t i; } u; + u.f = x; + thrust::detail::uint64_t mask = -static_cast(u.i >> 63) | (static_cast(1) << 63); + return u.i ^ mask; + } +}; + + +template +void radix_sort(RandomAccessIterator1 keys1, + RandomAccessIterator2 keys2, + RandomAccessIterator3 vals1, + RandomAccessIterator4 vals2, + const size_t N) +{ + typedef typename thrust::iterator_value::type KeyType; + + typedef RadixEncoder Encoder; + typedef typename Encoder::result_type EncodedType; + + static const unsigned int NumHistograms = (8 * sizeof(EncodedType) + (RadixBits - 1)) / RadixBits; + static const unsigned int HistogramSize = 1 << RadixBits; + + static const EncodedType BitMask = static_cast((1 << RadixBits) - 1); + + Encoder encode; + + // storage for histograms + size_t histograms[NumHistograms][HistogramSize] = {{0}}; + + // see which passes can be eliminated + bool skip_shuffle[NumHistograms] = {false}; + + // false if most recent data is stored in (keys1,vals1) + bool flip = false; + + // compute histograms + for (size_t i = 0; i < N; i++) + { + const EncodedType x = encode(keys1[i]); + + for (unsigned int j = 0; j < NumHistograms; j++) + { + const EncodedType BitShift = RadixBits * j; + histograms[j][(x >> BitShift) & BitMask]++; + } + } + + // scan histograms + for (unsigned int i = 0; i < NumHistograms; i++) + { + size_t sum = 0; + + for (unsigned int j = 0; j < HistogramSize; j++) + { + size_t bin = histograms[i][j]; + + if (bin == N) + skip_shuffle[i] = true; + + histograms[i][j] = sum; + + sum = sum + bin; + } + } + + // shuffle keys and (optionally) values + for (unsigned int i = 0; i < NumHistograms; i++) + { + const EncodedType BitShift = static_cast(RadixBits * i); + + if (!skip_shuffle[i]) + { + if (flip) + { + for (size_t j = 0; j < N; j++) + { + const EncodedType x = encode(keys2[j]); + size_t position = histograms[i][(x >> BitShift) & BitMask]++; + + RandomAccessIterator1 temp_keys1 = keys1; + temp_keys1 += position; + + RandomAccessIterator2 temp_keys2 = keys2; + temp_keys2 += j; + + // keys1[position] = keys2[j] + *temp_keys1 = *temp_keys2; + + if (HasValues) + { + RandomAccessIterator3 temp_vals1 = vals1; + temp_vals1 += position; + + RandomAccessIterator4 temp_vals2 = vals2; + temp_vals2 += j; + + // vals1[position] = vals2[j] + *temp_vals1 = *temp_vals2; + } + } + } + else + { + for (size_t j = 0; j < N; j++) + { + const EncodedType x = encode(keys1[j]); + size_t position = histograms[i][(x >> BitShift) & BitMask]++; + + RandomAccessIterator1 temp_keys1 = keys1; + temp_keys1 += j; + + RandomAccessIterator2 temp_keys2 = keys2; + temp_keys2 += position; + + // keys2[position] = keys1[j]; + *temp_keys2 = *temp_keys1; + + if (HasValues) + { + RandomAccessIterator3 temp_vals1 = vals1; + temp_vals1 += j; + + RandomAccessIterator4 temp_vals2 = vals2; + temp_vals2 += position; + + // vals2[position] = vals1[j] + *temp_vals2 = *temp_vals1; + } + } + } + + flip = (flip) ? 
false : true; + } + } + + // ensure final values are in (keys1,vals1) + if (flip) + { + thrust::copy(keys2, keys2 + N, keys1); + if (HasValues) + thrust::copy(vals2, vals2 + N, vals1); + } +} + + +// Select best radix sort parameters based on sizeof(T) and input size +// These particular values were determined through empirical testing on a Core i7 950 CPU +template +struct radix_sort_dispatcher +{ +}; + +template <> +struct radix_sort_dispatcher<1> +{ + template + void operator()(RandomAccessIterator1 keys1, RandomAccessIterator2 keys2, const size_t N) + { + detail::radix_sort<8,false>(keys1, keys2, static_cast(0), static_cast(0), N); + } + template + void operator()(RandomAccessIterator1 keys1, RandomAccessIterator2 keys2, RandomAccessIterator3 vals1, RandomAccessIterator4 vals2, const size_t N) + { + detail::radix_sort<8,true>(keys1, keys2, vals1, vals2, N); + } +}; + +template <> +struct radix_sort_dispatcher<2> +{ + template + void operator()(RandomAccessIterator1 keys1, RandomAccessIterator2 keys2, const size_t N) + { + if (N < (1 << 16)) + detail::radix_sort<8,false>(keys1, keys2, static_cast(0), static_cast(0), N); + else + detail::radix_sort<16,false>(keys1, keys2, static_cast(0), static_cast(0), N); + } + template + void operator()(RandomAccessIterator1 keys1, RandomAccessIterator2 keys2, RandomAccessIterator3 vals1, RandomAccessIterator4 vals2, const size_t N) + { + if (N < (1 << 15)) + detail::radix_sort<8,true>(keys1, keys2, vals1, vals2, N); + else + detail::radix_sort<16,true>(keys1, keys2, vals1, vals2, N); + } +}; + +template <> +struct radix_sort_dispatcher<4> +{ + template + void operator()(RandomAccessIterator1 keys1, RandomAccessIterator2 keys2, const size_t N) + { + if (N < (1 << 22)) + detail::radix_sort<8,false>(keys1, keys2, static_cast(0), static_cast(0), N); + else + detail::radix_sort<4,false>(keys1, keys2, static_cast(0), static_cast(0), N); + } + template + void operator()(RandomAccessIterator1 keys1, RandomAccessIterator2 keys2, RandomAccessIterator3 vals1, RandomAccessIterator4 vals2, const size_t N) + { + if (N < (1 << 22)) + detail::radix_sort<8,true>(keys1, keys2, vals1, vals2, N); + else + detail::radix_sort<3,true>(keys1, keys2, vals1, vals2, N); + } +}; + +template <> +struct radix_sort_dispatcher<8> +{ + template + void operator()(RandomAccessIterator1 keys1, RandomAccessIterator2 keys2, const size_t N) + { + if (N < (1 << 21)) + detail::radix_sort<8,false>(keys1, keys2, static_cast(0), static_cast(0), N); + else + detail::radix_sort<4,false>(keys1, keys2, static_cast(0), static_cast(0), N); + } + template + void operator()(RandomAccessIterator1 keys1, RandomAccessIterator2 keys2, RandomAccessIterator3 vals1, RandomAccessIterator4 vals2, const size_t N) + { + if (N < (1 << 21)) + detail::radix_sort<8,true>(keys1, keys2, vals1, vals2, N); + else + detail::radix_sort<3,true>(keys1, keys2, vals1, vals2, N); + } +}; + +template +void radix_sort(RandomAccessIterator1 keys1, + RandomAccessIterator2 keys2, + const size_t N) +{ + typedef typename thrust::iterator_value::type KeyType; + radix_sort_dispatcher()(keys1, keys2, N); +} + +template +void radix_sort(RandomAccessIterator1 keys1, + RandomAccessIterator2 keys2, + RandomAccessIterator3 vals1, + RandomAccessIterator4 vals2, + const size_t N) +{ + typedef typename thrust::iterator_value::type KeyType; + radix_sort_dispatcher()(keys1, keys2, vals1, vals2, N); +} + +} // namespace detail + +////////////// +// Key Sort // +////////////// + +template +void stable_radix_sort(RandomAccessIterator first, + 
RandomAccessIterator last) +{ + typedef typename thrust::iterator_system::type ExecutionPolicy; + typedef typename thrust::iterator_value::type KeyType; + + size_t N = last - first; + + // XXX assumes ExecutionPolicy is default constructible + // XXX consider how to get stateful systems into this function + ExecutionPolicy exec; + thrust::detail::temporary_array temp(exec, N); + + detail::radix_sort(first, temp.begin(), N); +} + + +//////////////////// +// Key-Value Sort // +//////////////////// + +template +void stable_radix_sort_by_key(RandomAccessIterator1 first1, + RandomAccessIterator1 last1, + RandomAccessIterator2 first2) +{ + // XXX the type of exec should be + // typedef decltype(select_system(first1,last1,first2)) system; + typedef typename thrust::iterator_system::type ExecutionPolicy; + typedef typename thrust::iterator_value::type KeyType; + typedef typename thrust::iterator_value::type ValueType; + + size_t N = last1 - first1; + + // XXX assumes ExecutionPolicy is default constructible + // XXX consider how to get stateful systems into this function + ExecutionPolicy exec; + thrust::detail::temporary_array temp1(exec, N); + thrust::detail::temporary_array temp2(exec, N); + + detail::radix_sort(first1, temp1.begin(), first2, temp2.begin(), N); +} + +} // end namespace scalar +} // end namespace internal +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/internal/scalar/trivial_copy.h b/compat/thrust/system/detail/internal/scalar/trivial_copy.h new file mode 100644 index 0000000..8f008b5 --- /dev/null +++ b/compat/thrust/system/detail/internal/scalar/trivial_copy.h @@ -0,0 +1,51 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file trivial_copy.h + * \brief Sequential copy algorithms for plain-old-data. + */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace internal +{ +namespace scalar +{ + +template + T *trivial_copy_n(const T *first, + std::ptrdiff_t n, + T *result) +{ + std::memmove(result, first, n * sizeof(T)); + return result + n; +} // end trivial_copy_n() + +} // end namespace scalar +} // end namespace internal +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/internal/scalar/unique.h b/compat/thrust/system/detail/internal/scalar/unique.h new file mode 100644 index 0000000..cfc60c9 --- /dev/null +++ b/compat/thrust/system/detail/internal/scalar/unique.h @@ -0,0 +1,90 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file unique.h + * \brief Sequential implementations of unique algorithms. + */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace internal +{ +namespace scalar +{ + +template + OutputIterator unique_copy(InputIterator first, + InputIterator last, + OutputIterator output, + BinaryPredicate binary_pred) +{ + typedef typename thrust::iterator_traits::value_type T; + + if(first != last) + { + T prev = *first; + + for(++first; first != last; ++first) + { + T temp = *first; + + if (!binary_pred(prev, temp)) + { + *output = prev; + + ++output; + + prev = temp; + } + } + + *output = prev; + ++output; + } + + return output; +} // end unique_copy() + + +template + ForwardIterator unique(ForwardIterator first, + ForwardIterator last, + BinaryPredicate binary_pred) +{ + // unique_copy() permits in-situ operation + return thrust::system::detail::internal::scalar::unique_copy(first, last, first, binary_pred); +} // end unique() + +} // end namespace scalar +} // end namespace internal +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/internal/scalar/unique_by_key.h b/compat/thrust/system/detail/internal/scalar/unique_by_key.h new file mode 100644 index 0000000..b0be266 --- /dev/null +++ b/compat/thrust/system/detail/internal/scalar/unique_by_key.h @@ -0,0 +1,109 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file unique_by_key.h + * \brief Sequential implementations of unique_by_key algorithms. 
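+ *
+ * Example (editorial sketch, not part of the original NVIDIA header; the key
+ * range is assumed to be sorted, or at least grouped by equal keys):
+ *
+ * \code
+ * int keys[5] = {1, 1, 2, 3, 3};
+ * int vals[5] = {9, 8, 7, 6, 5};
+ * thrust::pair<int*, int*> ends =
+ *     thrust::system::detail::internal::scalar::unique_by_key(
+ *         keys, keys + 5, vals, thrust::equal_to<int>());
+ * // keys now begins {1, 2, 3}, vals begins {9, 7, 6},
+ * // ends.first == keys + 3 and ends.second == vals + 3
+ * \endcode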
+ */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace detail +{ +namespace internal +{ +namespace scalar +{ + +template + thrust::pair + unique_by_key_copy(InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + OutputIterator1 keys_output, + OutputIterator2 values_output, + BinaryPredicate binary_pred) +{ + typedef typename thrust::iterator_traits::value_type InputKeyType; + typedef typename thrust::iterator_traits::value_type OutputValueType; + + if(keys_first != keys_last) + { + InputKeyType temp_key = *keys_first; + OutputValueType temp_value = *values_first; + + for(++keys_first, ++values_first; + keys_first != keys_last; + ++keys_first, ++values_first) + { + InputKeyType key = *keys_first; + OutputValueType value = *values_first; + + if(!binary_pred(temp_key, key)) + { + *keys_output = temp_key; + *values_output = temp_value; + + ++keys_output; + ++values_output; + + temp_key = key; + temp_value = value; + } + } + + *keys_output = temp_key; + *values_output = temp_value; + + ++keys_output; + ++values_output; + } + + return thrust::make_pair(keys_output, values_output); +} // end unique_by_key_copy() + + +template + thrust::pair + unique_by_key(ForwardIterator1 keys_first, + ForwardIterator1 keys_last, + ForwardIterator2 values_first, + BinaryPredicate binary_pred) +{ + // unique_by_key_copy() permits in-situ operation + return thrust::system::detail::internal::scalar::unique_by_key_copy(keys_first, keys_last, values_first, keys_first, values_first, binary_pred); +} // end unique_by_key() + +} // end namespace scalar +} // end namespace internal +} // end namespace detail +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/detail/system_error.inl b/compat/thrust/system/detail/system_error.inl new file mode 100644 index 0000000..74909be --- /dev/null +++ b/compat/thrust/system/detail/system_error.inl @@ -0,0 +1,111 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +#pragma once + +#include + +namespace thrust +{ + +namespace system +{ + + +system_error + ::system_error(error_code ec, const std::string &what_arg) + : std::runtime_error(what_arg), m_error_code(ec) +{ + +} // end system_error::system_error() + + +system_error + ::system_error(error_code ec, const char *what_arg) + : std::runtime_error(what_arg), m_error_code(ec) +{ + ; +} // end system_error::system_error() + + +system_error + ::system_error(error_code ec) + : std::runtime_error(""), m_error_code(ec) +{ + ; +} // end system_error::system_error() + + +system_error + ::system_error(int ev, const error_category &ecat, const std::string &what_arg) + : std::runtime_error(what_arg), m_error_code(ev,ecat) +{ + ; +} // end system_error::system_error() + + +system_error + ::system_error(int ev, const error_category &ecat, const char *what_arg) + : std::runtime_error(what_arg), m_error_code(ev,ecat) +{ + ; +} // end system_error::system_error() + + +system_error + ::system_error(int ev, const error_category &ecat) + : std::runtime_error(""), m_error_code(ev,ecat) +{ + ; +} // end system_error::system_error() + + +const error_code &system_error + ::code(void) const throw() +{ + return m_error_code; +} // end system_error::code() + + +const char *system_error + ::what(void) const throw() +{ + if(m_what.empty()) + { + try + { + m_what = this->std::runtime_error::what(); + if(m_error_code) + { + if(!m_what.empty()) m_what += ": "; + m_what += m_error_code.message(); + } + } + catch(...) + { + return std::runtime_error::what(); + } + } + + return m_what.c_str(); +} // end system_error::what() + + +} // end system + +} // end thrust + diff --git a/compat/thrust/system/error_code.h b/compat/thrust/system/error_code.h new file mode 100644 index 0000000..2b6582c --- /dev/null +++ b/compat/thrust/system/error_code.h @@ -0,0 +1,521 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file error_code.h + * \brief An object used to hold error values, such as those originating from the + * operating system or other low-level application program interfaces. + */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ + +namespace system +{ + + +/*! \addtogroup system_diagnostics + * \{ + */ + +class error_condition; +class error_code; + +/*! A metafunction returning whether or not the parameter is an \p error_code enum. + */ +template struct is_error_code_enum : public thrust::detail::false_type {}; + +/*! A metafunction returning whether or not the parameter is an \p error_condition enum. + */ +template struct is_error_condition_enum : public thrust::detail::false_type {}; + + +// XXX N3092 prefers enum class errc { ... 
} +namespace errc +{ + +enum errc_t +{ + address_family_not_supported = detail::eafnosupport, + address_in_use = detail::eaddrinuse, + address_not_available = detail::eaddrnotavail, + already_connected = detail::eisconn, + argument_list_too_long = detail::e2big, + argument_out_of_domain = detail::edom, + bad_address = detail::efault, + bad_file_descriptor = detail::ebadf, + bad_message = detail::ebadmsg, + broken_pipe = detail::epipe, + connection_aborted = detail::econnaborted, + connection_already_in_progress = detail::ealready, + connection_refused = detail::econnrefused, + connection_reset = detail::econnreset, + cross_device_link = detail::exdev, + destination_address_required = detail::edestaddrreq, + device_or_resource_busy = detail::ebusy, + directory_not_empty = detail::enotempty, + executable_format_error = detail::enoexec, + file_exists = detail::eexist, + file_too_large = detail::efbig, + filename_too_long = detail::enametoolong, + function_not_supported = detail::enosys, + host_unreachable = detail::ehostunreach, + identifier_removed = detail::eidrm, + illegal_byte_sequence = detail::eilseq, + inappropriate_io_control_operation = detail::enotty, + interrupted = detail::eintr, + invalid_argument = detail::einval, + invalid_seek = detail::espipe, + io_error = detail::eio, + is_a_directory = detail::eisdir, + message_size = detail::emsgsize, + network_down = detail::enetdown, + network_reset = detail::enetreset, + network_unreachable = detail::enetunreach, + no_buffer_space = detail::enobufs, + no_child_process = detail::echild, + no_link = detail::enolink, + no_lock_available = detail::enolck, + no_message_available = detail::enodata, + no_message = detail::enomsg, + no_protocol_option = detail::enoprotoopt, + no_space_on_device = detail::enospc, + no_stream_resources = detail::enosr, + no_such_device_or_address = detail::enxio, + no_such_device = detail::enodev, + no_such_file_or_directory = detail::enoent, + no_such_process = detail::esrch, + not_a_directory = detail::enotdir, + not_a_socket = detail::enotsock, + not_a_stream = detail::enostr, + not_connected = detail::enotconn, + not_enough_memory = detail::enomem, + not_supported = detail::enotsup, + operation_canceled = detail::ecanceled, + operation_in_progress = detail::einprogress, + operation_not_permitted = detail::eperm, + operation_not_supported = detail::eopnotsupp, + operation_would_block = detail::ewouldblock, + owner_dead = detail::eownerdead, + permission_denied = detail::eacces, + protocol_error = detail::eproto, + protocol_not_supported = detail::eprotonosupport, + read_only_file_system = detail::erofs, + resource_deadlock_would_occur = detail::edeadlk, + resource_unavailable_try_again = detail::eagain, + result_out_of_range = detail::erange, + state_not_recoverable = detail::enotrecoverable, + stream_timeout = detail::etime, + text_file_busy = detail::etxtbsy, + timed_out = detail::etimedout, + too_many_files_open_in_system = detail::enfile, + too_many_files_open = detail::emfile, + too_many_links = detail::emlink, + too_many_symbolic_link_levels = detail::eloop, + value_too_large = detail::eoverflow, + wrong_protocol_type = detail::eprototype +}; // end errc_t + +} // end namespace errc + + +/*! Specialization of \p is_error_condition_enum for \p errc::errc_t + */ +template<> struct is_error_condition_enum : public thrust::detail::true_type {}; + + +// [19.5.1.1] class error_category + +/*! 
\brief The class \p error_category serves as a base class for types used to identify the + * source and encoding of a particular category of error code. Classes may be derived + * from \p error_category to support categories of errors in addition to those defined + * in the C++ International Standard. + */ +class error_category +{ + public: + /*! Destructor does nothing. + */ + inline virtual ~error_category(void); + + // XXX enable upon c++0x + // error_category(const error_category &) = delete; + // error_category &operator=(const error_category &) = delete; + + /*! \return A string naming the error category. + */ + inline virtual const char *name(void) const = 0; + + /*! \return \p error_condition(ev, *this). + */ + inline virtual error_condition default_error_condition(int ev) const; + + /*! \return default_error_condition(code) == condition + */ + inline virtual bool equivalent(int code, const error_condition &condition) const; + + /*! \return *this == code.category() && code.value() == condition + */ + inline virtual bool equivalent(const error_code &code, int condition) const; + + /*! \return A string that describes the error condition denoted by \p ev. + */ + virtual std::string message(int ev) const = 0; + + /*! \return *this == &rhs + */ + inline bool operator==(const error_category &rhs) const; + + /*! \return !(*this == rhs) + */ + inline bool operator!=(const error_category &rhs) const; + + /*! \return less()(this, &rhs) + * \note \c less provides a total ordering for pointers. + */ + inline bool operator<(const error_category &rhs) const; +}; // end error_category + + +// [19.5.1.5] error_category objects + + +/*! \return A reference to an object of a type derived from class \p error_category. + * \note The object's \p default_error_condition and \p equivalent virtual functions + * shall behave as specified for the class \p error_category. The object's + * \p name virtual function shall return a pointer to the string "generic". + */ +inline const error_category &generic_category(void); + + +/*! \return A reference to an object of a type derived from class \p error_category. + * \note The object's \p equivalent virtual functions shall behave as specified for + * class \p error_category. The object's \p name virtual function shall return + * a pointer to the string "system". The object's \p default_error_condition + * virtual function shall behave as follows: + * + * If the argument ev corresponds to a POSIX errno value + * \c posv, the function shall return error_condition(ev,generic_category()). + * Otherwise, the function shall return error_condition(ev,system_category()). + * What constitutes correspondence for any given operating system is unspecified. + */ +inline const error_category &system_category(void); + + +// [19.5.2] Class error_code + + +/*! \brief The class \p error_code describes an object used to hold error code values, such as + * those originating from the operating system or other low-level application program + * interfaces. + */ +class error_code +{ + public: + // [19.5.2.2] constructors: + + /*! Effects: Constructs an object of type \p error_code. + * \post value() == 0 and category() == &system_category(). + */ + inline error_code(void); + + /*! Effects: Constructs an object of type \p error_code. + * \post value() == val and category() == &cat. + */ + inline error_code(int val, const error_category &cat); + + /*! Effects: Constructs an object of type \p error_code. + * \post *this == make_error_code(e). 
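+ *
+ * For instance (illustrative note added here, not part of the original header),
+ * construction from a \p thrust::system::errc value goes through \p make_error_code:
+ *
+ * \code
+ * thrust::system::error_code ec =
+ *     thrust::system::make_error_code(thrust::system::errc::invalid_argument);
+ * std::string what = ec.message(); // generic_category() text for EINVAL
+ * bool failed = ec;                // true, because ec.value() != 0
+ * \endcode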
+ */ + template + error_code(ErrorCodeEnum e +// XXX WAR msvc's problem with enable_if +#if THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC + , typename thrust::detail::enable_if::value>::type * = 0 +#endif // THRUST_HOST_COMPILER_MSVC + ); + + // [19.5.2.3] modifiers: + + /*! \post value() == val and category() == &cat. + */ + inline void assign(int val, const error_category &cat); + + /*! \post *this == make_error_code(e). + */ + template +// XXX WAR msvc's problem with enable_if +#if THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC + typename thrust::detail::enable_if::value, error_code>::type & +#else + error_code & +#endif // THRUST_HOST_COMPILER_MSVC + operator=(ErrorCodeEnum e); + + /*! \post value() == 0 and category() == system_category(). + */ + inline void clear(void); + + // [19.5.2.4] observers: + + /*! \return An integral value of this \p error_code object. + */ + inline int value(void) const; + + /*! \return An \p error_category describing the category of this \p error_code object. + */ + inline const error_category &category(void) const; + + /*! \return category().default_error_condition(). + */ + inline error_condition default_error_condition(void) const; + + /*! \return category().message(value()). + */ + inline std::string message(void) const; + + // XXX replace the below upon c++0x + // inline explicit operator bool (void) const; + + /*! \return value() != 0. + */ + inline operator bool (void) const; + + /*! \cond + */ + private: + int m_val; + const error_category *m_cat; + /*! \endcond + */ +}; // end error_code + + +// [19.5.2.5] Class error_code non-member functions + + +// XXX replace errc::errc_t with errc upon c++0x +/*! \return error_code(static_cast(e), generic_category()) + */ +inline error_code make_error_code(errc::errc_t e); + + +/*! \return lhs.category() < rhs.category() || lhs.category() == rhs.category() && lhs.value() < rhs.value(). + */ +inline bool operator<(const error_code &lhs, const error_code &rhs); + + +/*! Effects: os << ec.category().name() << ':' << ec.value(). + */ +template + std::basic_ostream& + operator<<(std::basic_ostream& os, const error_code &ec); + + +// [19.5.3] class error_condition + + +/*! \brief The class \p error_condition describes an object used to hold values identifying + * error conditions. + * + * \note \p error_condition values are portable abstractions, while \p error_code values + * are implementation specific. + */ +class error_condition +{ + public: + // [19.5.3.2] constructors + + /*! Constructs an object of type \p error_condition. + * \post value() == 0. + * \post category() == generic_category(). + */ + inline error_condition(void); + + /*! Constructs an object of type \p error_condition. + * \post value() == val. + * \post category() == cat. + */ + inline error_condition(int val, const error_category &cat); + + /*! Constructs an object of type \p error_condition. + * \post *this == make_error_condition(e). + * \note This constructor shall not participate in overload resolution unless + * is_error_condition_enum::value is true. + */ + template + error_condition(ErrorConditionEnum e +// XXX WAR msvc's problem with enable_if +#if THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC + , typename thrust::detail::enable_if::value>::type * = 0 +#endif // THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC + ); + + // [19.5.3.3] modifiers + + /*! Assigns to this \p error_code object from an error value and an \p error_category. + * \param val The new value to return from value(). 
+ * \param cat The new \p error_category to return from category(). + * \post value() == val. + * \post category() == cat. + */ + inline void assign(int val, const error_category &cat); + + /*! Assigns to this \p error_code object from an error condition enumeration. + * \return *this + * \post *this == make_error_condition(e). + * \note This operator shall not participate in overload resolution unless + * is_error_condition_enum::value is true. + */ + template +// XXX WAR msvc's problem with enable_if +#if THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC + typename thrust::detail::enable_if::value, error_condition>::type & +#else + error_condition & +#endif // THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC + operator=(ErrorConditionEnum e); + + /*! Clears this \p error_code object. + * \post value == 0 + * \post category() == generic_category(). + */ + inline void clear(void); + + // [19.5.3.4] observers + + /*! \return The value encoded by this \p error_condition. + */ + inline int value(void) const; + + /*! \return A const reference to the \p error_category encoded by this \p error_condition. + */ + inline const error_category &category(void) const; + + /*! \return category().message(value()). + */ + inline std::string message(void) const; + + // XXX replace below with this upon c++0x + //explicit operator bool (void) const; + + /*! \return value() != 0. + */ + inline operator bool (void) const; + + /*! \cond + */ + + private: + int m_val; + const error_category *m_cat; + + /*! \endcond + */ +}; // end error_condition + + + +// [19.5.3.5] Class error_condition non-member functions + +// XXX replace errc::errc_t with errc upon c++0x +/*! \return error_condition(static_cast(e), generic_category()). + */ +inline error_condition make_error_condition(errc::errc_t e); + + +/*! \return lhs.category() < rhs.category() || lhs.category() == rhs.category() && lhs.value() < rhs.value(). + */ +inline bool operator<(const error_condition &lhs, const error_condition &rhs); + + +// [19.5.4] Comparison operators + + +/*! \return lhs.category() == rhs.category() && lhs.value() == rhs.value(). + */ +inline bool operator==(const error_code &lhs, const error_code &rhs); + + +/*! \return lhs.category().equivalent(lhs.value(), rhs) || rhs.category().equivalent(lhs,rhs.value()). + */ +inline bool operator==(const error_code &lhs, const error_condition &rhs); + + +/*! \return rhs.category().equivalent(lhs.value(), lhs) || lhs.category().equivalent(rhs, lhs.value()). + */ +inline bool operator==(const error_condition &lhs, const error_code &rhs); + + +/*! \return lhs.category() == rhs.category() && lhs.value() == rhs.value() + */ +inline bool operator==(const error_condition &lhs, const error_condition &rhs); + + +/*! \return !(lhs == rhs) + */ +inline bool operator!=(const error_code &lhs, const error_code &rhs); + + +/*! \return !(lhs == rhs) + */ +inline bool operator!=(const error_code &lhs, const error_condition &rhs); + + +/*! \return !(lhs == rhs) + */ +inline bool operator!=(const error_condition &lhs, const error_code &rhs); + + +/*! \return !(lhs == rhs) + */ +inline bool operator!=(const error_condition &lhs, const error_condition &rhs); + +/*! 
\} // end system_diagnostics + */ + + +} // end system + + +// import names into thrust:: +using system::error_category; +using system::error_code; +using system::error_condition; +using system::is_error_code_enum; +using system::is_error_condition_enum; +using system::make_error_code; +using system::make_error_condition; + +// XXX replace with using system::errc upon c++0x +namespace errc = system::errc; + +using system::generic_category; +using system::system_category; + +} // end thrust + +#include +#include +#include + diff --git a/compat/thrust/system/omp/detail/adjacent_difference.h b/compat/thrust/system/omp/detail/adjacent_difference.h new file mode 100644 index 0000000..0bbc188 --- /dev/null +++ b/compat/thrust/system/omp/detail/adjacent_difference.h @@ -0,0 +1,50 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace omp +{ +namespace detail +{ + +template + OutputIterator adjacent_difference(execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + BinaryFunction binary_op) +{ + // omp prefers generic::adjacent_difference to cpp::adjacent_difference + return thrust::system::detail::generic::adjacent_difference(exec, first, last, result, binary_op); +} // end adjacent_difference() + +} // end detail +} // end omp +} // end system +} // end thrust + diff --git a/compat/thrust/system/omp/detail/assign_value.h b/compat/thrust/system/omp/detail/assign_value.h new file mode 100644 index 0000000..eda3b97 --- /dev/null +++ b/compat/thrust/system/omp/detail/assign_value.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system inherits assign_value +#include + diff --git a/compat/thrust/system/omp/detail/binary_search.h b/compat/thrust/system/omp/detail/binary_search.h new file mode 100644 index 0000000..254e6fd --- /dev/null +++ b/compat/thrust/system/omp/detail/binary_search.h @@ -0,0 +1,73 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace omp +{ +namespace detail +{ + + +template +ForwardIterator lower_bound(execution_policy &exec, + ForwardIterator begin, + ForwardIterator end, + const T& value, + StrictWeakOrdering comp) +{ + // omp prefers generic::lower_bound to cpp::lower_bound + return thrust::system::detail::generic::lower_bound(exec, begin, end, value, comp); +} + + +template +ForwardIterator upper_bound(execution_policy &exec, + ForwardIterator begin, + ForwardIterator end, + const T& value, + StrictWeakOrdering comp) +{ + // omp prefers generic::upper_bound to cpp::upper_bound + return thrust::system::detail::generic::upper_bound(exec, begin, end, value, comp); +} + + +template +bool binary_search(execution_policy &exec, + ForwardIterator begin, + ForwardIterator end, + const T& value, + StrictWeakOrdering comp) +{ + // omp prefers generic::binary_search to cpp::binary_search + return thrust::system::detail::generic::binary_search(exec, begin, end, value, comp); +} + + +} // end detail +} // end omp +} // end system +} // end thrust + diff --git a/compat/thrust/system/omp/detail/copy.h b/compat/thrust/system/omp/detail/copy.h new file mode 100644 index 0000000..b23ac18 --- /dev/null +++ b/compat/thrust/system/omp/detail/copy.h @@ -0,0 +1,57 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace omp +{ +namespace detail +{ + + +template +OutputIterator copy(execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result); + + +template +OutputIterator copy_n(execution_policy &exec, + InputIterator first, + Size n, + OutputIterator result); + + +} // end namespace detail +} // end namespace omp +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/omp/detail/copy.inl b/compat/thrust/system/omp/detail/copy.inl new file mode 100644 index 0000000..915ff92 --- /dev/null +++ b/compat/thrust/system/omp/detail/copy.inl @@ -0,0 +1,147 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace omp +{ +namespace detail +{ +namespace dispatch +{ + +template + OutputIterator copy(execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + thrust::incrementable_traversal_tag) +{ + return thrust::system::cpp::detail::copy(exec, first, last, result); +} // end copy() + + +template + OutputIterator copy(execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + thrust::random_access_traversal_tag) +{ + // XXX WAR problems reconciling unrelated types such as omp & tbb + // reinterpret iterators as the policy we were passed + // this ensures that generic::copy's implementation, which eventually results in + // zip_iterator works correctly + thrust::detail::tagged_iterator retagged_result(result); + + return thrust::system::detail::generic::copy(exec, thrust::reinterpret_tag(first), thrust::reinterpret_tag(last), retagged_result).base(); +} // end copy() + + +template + OutputIterator copy_n(execution_policy &exec, + InputIterator first, + Size n, + OutputIterator result, + thrust::incrementable_traversal_tag) +{ + return thrust::system::cpp::detail::copy_n(exec, first, n, result); +} // end copy_n() + + +template + OutputIterator copy_n(execution_policy &exec, + InputIterator first, + Size n, + OutputIterator result, + thrust::random_access_traversal_tag) +{ + // XXX WAR problems reconciling unrelated types such as omp & tbb + // reinterpret iterators as the policy we were passed + // this ensures that generic::copy's implementation, which eventually results in + // zip_iterator works correctly + thrust::detail::tagged_iterator retagged_result(result); + + return thrust::system::detail::generic::copy_n(exec, thrust::reinterpret_tag(first), n, retagged_result).base(); +} // end copy_n() + +} // end dispatch + + +template +OutputIterator copy(execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result) +{ + typedef typename thrust::iterator_traversal::type traversal1; + typedef typename thrust::iterator_traversal::type traversal2; + + typedef typename thrust::detail::minimum_type::type traversal; + + // dispatch on minimum traversal + return thrust::system::omp::detail::dispatch::copy(exec, first,last,result,traversal()); +} // end copy() + + + +template +OutputIterator copy_n(execution_policy &exec, + InputIterator first, + Size n, + OutputIterator result) +{ + typedef typename thrust::iterator_traversal::type traversal1; + typedef typename thrust::iterator_traversal::type traversal2; + + typedef typename thrust::detail::minimum_type::type traversal; + + // dispatch on minimum traversal + return thrust::system::omp::detail::dispatch::copy_n(exec,first,n,result,traversal()); +} // end copy_n() + + +} // end namespace detail +} // end namespace omp +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/omp/detail/copy_if.h b/compat/thrust/system/omp/detail/copy_if.h new file mode 100644 index 
0000000..46754a9 --- /dev/null +++ b/compat/thrust/system/omp/detail/copy_if.h @@ -0,0 +1,51 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace omp +{ +namespace detail +{ + + +template + OutputIterator copy_if(execution_policy &exec, + InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator result, + Predicate pred); + + +} // end detail +} // end omp +} // end system +} // end thrust + +#include + diff --git a/compat/thrust/system/omp/detail/copy_if.inl b/compat/thrust/system/omp/detail/copy_if.inl new file mode 100644 index 0000000..1af6a21 --- /dev/null +++ b/compat/thrust/system/omp/detail/copy_if.inl @@ -0,0 +1,54 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace omp +{ +namespace detail +{ + + +template + OutputIterator copy_if(execution_policy &exec, + InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator result, + Predicate pred) +{ + // omp prefers generic::copy_if to cpp::copy_if + return thrust::system::detail::generic::copy_if(exec, first, last, stencil, result, pred); +} // end copy_if() + + +} // end detail +} // end omp +} // end system +} // end thrust + diff --git a/compat/thrust/system/omp/detail/count.h b/compat/thrust/system/omp/detail/count.h new file mode 100644 index 0000000..da31ee8 --- /dev/null +++ b/compat/thrust/system/omp/detail/count.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
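The copy.inl above dispatches on the weaker of the two iterators' traversal tags: merely incrementable sequences fall back to the sequential cpp backend, while random-access sequences take the parallel generic path (with the result iterator retagged to work around mixed-system issues). A minimal call-site sketch, assuming the vendored Thrust headers are on the include path and the translation unit is built with OpenMP enabled (e.g. -fopenmp):

    #include <thrust/copy.h>
    #include <thrust/system/omp/execution_policy.h>
    #include <vector>

    int main()
    {
      std::vector<int> src(1 << 20, 1), dst(1 << 20);

      // std::vector iterators are random access, so this call reaches the
      // parallel generic::copy path described above; a node-based container
      // (e.g. std::list) would instead take the sequential cpp::copy path
      thrust::copy(thrust::omp::par, src.begin(), src.end(), dst.begin());

      return dst.back() == 1 ? 0 : 1;
    }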
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system inherits count
+#include <thrust/system/cpp/detail/count.h>
+
diff --git a/compat/thrust/system/omp/detail/default_decomposition.h b/compat/thrust/system/omp/detail/default_decomposition.h
new file mode 100644
index 0000000..f1904c2
--- /dev/null
+++ b/compat/thrust/system/omp/detail/default_decomposition.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright 2008-2012 NVIDIA Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/*! \file default_decomposition.h
+ * \brief Return a decomposition that is appropriate for the OpenMP backend.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/detail/internal/decompose.h>
+
+namespace thrust
+{
+namespace system
+{
+namespace omp
+{
+namespace detail
+{
+
+template <typename IndexType>
+thrust::system::detail::internal::uniform_decomposition<IndexType> default_decomposition(IndexType n);
+
+} // end namespace detail
+} // end namespace omp
+} // end namespace system
+} // end namespace thrust
+
+#include <thrust/system/omp/detail/default_decomposition.inl>
+
diff --git a/compat/thrust/system/omp/detail/default_decomposition.inl b/compat/thrust/system/omp/detail/default_decomposition.inl
new file mode 100644
index 0000000..366b4f5
--- /dev/null
+++ b/compat/thrust/system/omp/detail/default_decomposition.inl
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2008-2012 NVIDIA Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/system/omp/detail/default_decomposition.h>
+
+// don't attempt to #include this file without omp support
+#if (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE)
+#include <omp.h>
+#endif // omp support
+
+namespace thrust
+{
+namespace system
+{
+namespace omp
+{
+namespace detail
+{
+
+template <typename IndexType>
+thrust::system::detail::internal::uniform_decomposition<IndexType> default_decomposition(IndexType n)
+{
+  // we're attempting to launch an omp kernel, assert we're compiling with omp support
+  // ========================================================================
+  // X Note to the user: If you've found this line due to a compiler error, X
+  // X you need to enable OpenMP support in your compiler.
X + // ======================================================================== + THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); + +#if (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE) + return thrust::system::detail::internal::uniform_decomposition(n, 1, omp_get_num_procs()); +#else + return thrust::system::detail::internal::uniform_decomposition(n, 1, 1); +#endif +} + +} // end namespace detail +} // end namespace omp +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/omp/detail/equal.h b/compat/thrust/system/omp/detail/equal.h new file mode 100644 index 0000000..74e5518 --- /dev/null +++ b/compat/thrust/system/omp/detail/equal.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system inherits equal +#include + diff --git a/compat/thrust/system/omp/detail/execution_policy.h b/compat/thrust/system/omp/detail/execution_policy.h new file mode 100644 index 0000000..1b06224 --- /dev/null +++ b/compat/thrust/system/omp/detail/execution_policy.h @@ -0,0 +1,110 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
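default_decomposition above hands back one contiguous interval per processor reported by omp_get_num_procs() when OpenMP is available, and a single interval otherwise. A rough, self-contained sketch of the arithmetic such a uniform decomposition performs (hypothetical helper, not the library's internal uniform_decomposition class):

    #include <cstddef>
    #include <utility>
    #include <vector>

    // hypothetical stand-in for uniform_decomposition<IndexType>:
    // split [0, n) into at most p intervals of nearly equal size
    std::vector<std::pair<std::size_t, std::size_t> >
    make_uniform_decomposition(std::size_t n, std::size_t p)
    {
      std::vector<std::pair<std::size_t, std::size_t> > intervals;
      if(n == 0 || p == 0) return intervals;

      std::size_t chunk = (n + p - 1) / p;               // ceiling division
      for(std::size_t begin = 0; begin < n; begin += chunk)
      {
        std::size_t end = (begin + chunk < n) ? begin + chunk : n;
        intervals.push_back(std::make_pair(begin, end)); // one interval per chunk
      }

      return intervals;                                  // at most p intervals
    }

    int main()
    {
      // e.g. n = 10 elements over p = 3 processors -> [0,4) [4,8) [8,10)
      return make_uniform_decomposition(10, 3).size() == 3 ? 0 : 1;
    }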
+ */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +// put the canonical tag in the same ns as the backend's entry points +namespace omp +{ +namespace detail +{ + +// this awkward sequence of definitions arise +// from the desire both for tag to derive +// from execution_policy and for execution_policy +// to convert to tag (when execution_policy is not +// an ancestor of tag) + +// forward declaration of tag +struct tag; + +// forward declaration of execution_policy +template struct execution_policy; + +// specialize execution_policy for tag +template<> + struct execution_policy + : thrust::system::cpp::detail::execution_policy +{}; + +// tag's definition comes before the +// generic definition of execution_policy +struct tag : execution_policy {}; + +// allow conversion to tag when it is not a successor +template + struct execution_policy + : thrust::system::cpp::detail::execution_policy +{ + // allow conversion to tag + inline operator tag () const + { + return tag(); + } +}; + + +// overloads of select_system + +// XXX select_system(tbb, omp) & select_system(omp, tbb) are ambiguous +// because both convert to cpp without these overloads, which we +// arbitrarily define in the omp backend + +template +inline __host__ __device__ + System1 select_system(execution_policy s, thrust::system::tbb::detail::execution_policy) +{ + return thrust::detail::derived_cast(s); +} // end select_system() + + +template +inline __host__ __device__ + System2 select_system(thrust::system::tbb::detail::execution_policy, execution_policy s) +{ + return thrust::detail::derived_cast(s); +} // end select_system() + + +} // end detail + +// alias execution_policy and tag here +using thrust::system::omp::detail::execution_policy; +using thrust::system::omp::detail::tag; + +} // end omp +} // end system + +// alias items at top-level +namespace omp +{ + +using thrust::system::omp::execution_policy; +using thrust::system::omp::tag; + +} // end omp +} // end thrust + diff --git a/compat/thrust/system/omp/detail/extrema.h b/compat/thrust/system/omp/detail/extrema.h new file mode 100644 index 0000000..fb96770 --- /dev/null +++ b/compat/thrust/system/omp/detail/extrema.h @@ -0,0 +1,67 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace omp +{ +namespace detail +{ + +template +ForwardIterator max_element(execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + BinaryPredicate comp) +{ + // omp prefers generic::max_element to cpp::max_element + return thrust::system::detail::generic::max_element(exec, first, last, comp); +} // end max_element() + +template +ForwardIterator min_element(execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + BinaryPredicate comp) +{ + // omp prefers generic::min_element to cpp::min_element + return thrust::system::detail::generic::min_element(exec, first, last, comp); +} // end min_element() + +template +thrust::pair minmax_element(execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + BinaryPredicate comp) +{ + // omp prefers generic::minmax_element to cpp::minmax_element + return thrust::system::detail::generic::minmax_element(exec, first, last, comp); +} // end minmax_element() + +} // end detail +} // end omp +} // end system +} // end thrust + + diff --git a/compat/thrust/system/omp/detail/fill.h b/compat/thrust/system/omp/detail/fill.h new file mode 100644 index 0000000..5219e1c --- /dev/null +++ b/compat/thrust/system/omp/detail/fill.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system inherits fill +#include + diff --git a/compat/thrust/system/omp/detail/find.h b/compat/thrust/system/omp/detail/find.h new file mode 100644 index 0000000..a8dca5a --- /dev/null +++ b/compat/thrust/system/omp/detail/find.h @@ -0,0 +1,51 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file find.h + * \brief OpenMP implementation of find_if. 
+ */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace omp +{ +namespace detail +{ + +template +InputIterator find_if(execution_policy &exec, + InputIterator first, + InputIterator last, + Predicate pred) +{ + // omp prefers generic::find_if to cpp::find_if + return thrust::system::detail::generic::find_if(exec, first, last, pred); +} + +} // end namespace detail +} // end namespace omp +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/omp/detail/for_each.h b/compat/thrust/system/omp/detail/for_each.h new file mode 100644 index 0000000..1030623 --- /dev/null +++ b/compat/thrust/system/omp/detail/for_each.h @@ -0,0 +1,60 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file for_each.h + * \brief Defines the interface for a function that executes a + * function or functional for each value in a given range. + */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace omp +{ +namespace detail +{ + +template + RandomAccessIterator for_each(execution_policy &exec, + RandomAccessIterator first, + RandomAccessIterator last, + UnaryFunction f); + +template + RandomAccessIterator for_each_n(execution_policy &exec, + RandomAccessIterator first, + Size n, + UnaryFunction f); + +} // end namespace detail +} // end namespace omp +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/omp/detail/for_each.inl b/compat/thrust/system/omp/detail/for_each.inl new file mode 100644 index 0000000..c6ab827 --- /dev/null +++ b/compat/thrust/system/omp/detail/for_each.inl @@ -0,0 +1,97 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file for_each.inl + * \brief Inline file for for_each.h. + */ + +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace omp +{ +namespace detail +{ + +template +RandomAccessIterator for_each_n(execution_policy &, + RandomAccessIterator first, + Size n, + UnaryFunction f) +{ + // we're attempting to launch an omp kernel, assert we're compiling with omp support + // ======================================================================== + // X Note to the user: If you've found this line due to a compiler error, X + // X you need to enable OpenMP support in your compiler. 
X + // ======================================================================== + THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); + + if (n <= 0) return first; //empty range + + // create a wrapped function for f + typedef typename thrust::iterator_reference::type reference; + thrust::detail::host_function wrapped_f(f); + +// do not attempt to compile the body of this function, which depends on #pragma omp, +// without support from the compiler +// XXX implement the body of this function in another file to eliminate this ugliness +#if (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE) + // use a signed type for the iteration variable or suffer the consequences of warnings + typedef typename thrust::iterator_difference::type DifferenceType; + DifferenceType signed_n = n; +#pragma omp parallel for + for(DifferenceType i = 0; + i < signed_n; + ++i) + { + RandomAccessIterator temp = first + i; + wrapped_f(*temp); + } +#endif // THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE + + return first + n; +} // end for_each_n() + +template + RandomAccessIterator for_each(execution_policy &s, + RandomAccessIterator first, + RandomAccessIterator last, + UnaryFunction f) +{ + return omp::detail::for_each_n(s, first, thrust::distance(first,last), f); +} // end for_each() + +} // end namespace detail +} // end namespace omp +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/omp/detail/gather.h b/compat/thrust/system/omp/detail/gather.h new file mode 100644 index 0000000..dfb7d7f --- /dev/null +++ b/compat/thrust/system/omp/detail/gather.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system inherits gather +#include + diff --git a/compat/thrust/system/omp/detail/generate.h b/compat/thrust/system/omp/detail/generate.h new file mode 100644 index 0000000..0cb33b9 --- /dev/null +++ b/compat/thrust/system/omp/detail/generate.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
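for_each.inl above wraps the user's function in host_function and walks a signed index inside "#pragma omp parallel for"; the THRUST_STATIC_ASSERT fires at compile time if OpenMP support is missing. From the caller's side this is plain thrust::for_each with the omp policy, as in this minimal sketch (assumes the vendored headers are on the include path and -fopenmp or equivalent):

    #include <thrust/for_each.h>
    #include <thrust/system/omp/execution_policy.h>
    #include <vector>

    struct increment
    {
      void operator()(int &x) const { ++x; }   // applied once per element
    };

    int main()
    {
      std::vector<int> v(1 << 20, 0);

      // iterations are distributed across OpenMP threads by the backend above
      thrust::for_each(thrust::omp::par, v.begin(), v.end(), increment());

      return v.front() == 1 ? 0 : 1;
    }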
+ */ + +#pragma once + +#include + +// this system inherits generate +#include + diff --git a/compat/thrust/system/omp/detail/get_value.h b/compat/thrust/system/omp/detail/get_value.h new file mode 100644 index 0000000..e376e65 --- /dev/null +++ b/compat/thrust/system/omp/detail/get_value.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system inherits get_value +#include + diff --git a/compat/thrust/system/omp/detail/inner_product.h b/compat/thrust/system/omp/detail/inner_product.h new file mode 100644 index 0000000..351421a --- /dev/null +++ b/compat/thrust/system/omp/detail/inner_product.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system inherits inner_product +#include + diff --git a/compat/thrust/system/omp/detail/iter_swap.h b/compat/thrust/system/omp/detail/iter_swap.h new file mode 100644 index 0000000..16176ec --- /dev/null +++ b/compat/thrust/system/omp/detail/iter_swap.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system inherits iter_swap +#include + diff --git a/compat/thrust/system/omp/detail/logical.h b/compat/thrust/system/omp/detail/logical.h new file mode 100644 index 0000000..b2a80de --- /dev/null +++ b/compat/thrust/system/omp/detail/logical.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system inherits logical +#include + diff --git a/compat/thrust/system/omp/detail/malloc_and_free.h b/compat/thrust/system/omp/detail/malloc_and_free.h new file mode 100644 index 0000000..811a552 --- /dev/null +++ b/compat/thrust/system/omp/detail/malloc_and_free.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system inherits malloc and free +#include + diff --git a/compat/thrust/system/omp/detail/memory.inl b/compat/thrust/system/omp/detail/memory.inl new file mode 100644 index 0000000..7d53de6 --- /dev/null +++ b/compat/thrust/system/omp/detail/memory.inl @@ -0,0 +1,110 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace omp +{ + + +template + template + reference & + reference + ::operator=(const reference &other) +{ + return super_t::operator=(other); +} // end reference::operator=() + +template + reference & + reference + ::operator=(const value_type &x) +{ + return super_t::operator=(x); +} // end reference::operator=() + +template +__host__ __device__ +void swap(reference a, reference b) +{ + a.swap(b); +} // end swap() + +namespace detail +{ + +// XXX circular #inclusion problems cause the compiler to believe that cpp::malloc +// is not defined +// WAR the problem by using adl to call cpp::malloc, which requires it to depend +// on a template parameter +template + pointer malloc_workaround(Tag t, std::size_t n) +{ + return pointer(malloc(t, n)); +} // end malloc_workaround() + +// XXX circular #inclusion problems cause the compiler to believe that cpp::free +// is not defined +// WAR the problem by using adl to call cpp::free, which requires it to depend +// on a template parameter +template + void free_workaround(Tag t, pointer ptr) +{ + free(t, ptr.get()); +} // end free_workaround() + +} // end detail + +inline pointer malloc(std::size_t n) +{ + // XXX this is how we'd like to implement this function, + // if not for circular #inclusion problems: + // + // return pointer(thrust::system::cpp::malloc(n)) + // + return detail::malloc_workaround(cpp::tag(), n); +} // end malloc() + +template +pointer malloc(std::size_t n) +{ + pointer raw_ptr = thrust::system::omp::malloc(sizeof(T) * n); + return pointer(reinterpret_cast(raw_ptr.get())); +} // end malloc() + +inline void free(pointer ptr) +{ + // XXX this is how we'd like to implement this function, + // if not for circular #inclusion problems: + // + // thrust::system::cpp::free(ptr) + // + detail::free_workaround(cpp::tag(), ptr); +} // end free() + +} // end omp +} // end system +} // end thrust + diff --git a/compat/thrust/system/omp/detail/merge.h b/compat/thrust/system/omp/detail/merge.h new file mode 100644 index 0000000..a7047aa --- /dev/null +++ b/compat/thrust/system/omp/detail/merge.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system inherits merge +#include + diff --git a/compat/thrust/system/omp/detail/mismatch.h b/compat/thrust/system/omp/detail/mismatch.h new file mode 100644 index 0000000..03980cf --- /dev/null +++ b/compat/thrust/system/omp/detail/mismatch.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
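memory.inl above works around a circular-include problem: cpp::malloc and cpp::free may not be declared yet where omp::malloc and omp::free are defined, so the calls are routed through helper templates whose arguments depend on a template parameter, deferring name lookup (via ADL) to instantiation time. The same idiom in isolation, with hypothetical names:

    #include <cstddef>
    #include <cstdlib>

    namespace backend { struct tag {}; }

    // the call below depends on the template parameter Tag, so the name
    // 'allocate' is looked up only at instantiation time (via ADL on
    // backend::tag) and need not be declared yet at this point
    template <typename Tag>
    void *allocate_workaround(Tag t, std::size_t n)
    {
      return allocate(t, n);
    }

    namespace backend
    {
      // declared after the template, mirroring the circular-include situation
      inline void *allocate(tag, std::size_t n) { return std::malloc(n); }
    }

    int main()
    {
      void *p = allocate_workaround(backend::tag(), 64);
      std::free(p);
      return 0;
    }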
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system inherits mismatch +#include + diff --git a/compat/thrust/system/omp/detail/par.h b/compat/thrust/system/omp/detail/par.h new file mode 100644 index 0000000..fa6d18e --- /dev/null +++ b/compat/thrust/system/omp/detail/par.h @@ -0,0 +1,66 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace omp +{ +namespace detail +{ + + +struct par_t : thrust::system::omp::detail::execution_policy +{ + par_t() : thrust::system::omp::detail::execution_policy() {} + + template + thrust::detail::execute_with_allocator + operator()(Allocator &alloc) const + { + return thrust::detail::execute_with_allocator(alloc); + } +}; + + +} // end detail + + +static const detail::par_t par; + + +} // end omp +} // end system + + +// alias par here +namespace omp +{ + + +using thrust::system::omp::par; + + +} // end omp +} // end thrust + diff --git a/compat/thrust/system/omp/detail/partition.h b/compat/thrust/system/omp/detail/partition.h new file mode 100644 index 0000000..edcbc30 --- /dev/null +++ b/compat/thrust/system/omp/detail/partition.h @@ -0,0 +1,91 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file reduce.h + * \brief OpenMP implementation of reduce algorithms. 
+ */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace omp +{ +namespace detail +{ + + +template + ForwardIterator stable_partition(execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + Predicate pred); + +template + ForwardIterator stable_partition(execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + InputIterator stencil, + Predicate pred); + +template + thrust::pair + stable_partition_copy(execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator1 out_true, + OutputIterator2 out_false, + Predicate pred); + +template + thrust::pair + stable_partition_copy(execution_policy &exec, + InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator1 out_true, + OutputIterator2 out_false, + Predicate pred); + + +} // end namespace detail +} // end namespace omp +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/omp/detail/partition.inl b/compat/thrust/system/omp/detail/partition.inl new file mode 100644 index 0000000..da629e5 --- /dev/null +++ b/compat/thrust/system/omp/detail/partition.inl @@ -0,0 +1,108 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file reduce.h + * \brief OpenMP implementation of reduce algorithms. 
+ */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace omp +{ +namespace detail +{ + + +template + ForwardIterator stable_partition(execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + Predicate pred) +{ + // omp prefers generic::stable_partition to cpp::stable_partition + return thrust::system::detail::generic::stable_partition(exec, first, last, pred); +} // end stable_partition() + + +template + ForwardIterator stable_partition(execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + InputIterator stencil, + Predicate pred) +{ + // omp prefers generic::stable_partition to cpp::stable_partition + return thrust::system::detail::generic::stable_partition(exec, first, last, stencil, pred); +} // end stable_partition() + + +template + thrust::pair + stable_partition_copy(execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator1 out_true, + OutputIterator2 out_false, + Predicate pred) +{ + // omp prefers generic::stable_partition_copy to cpp::stable_partition_copy + return thrust::system::detail::generic::stable_partition_copy(exec, first, last, out_true, out_false, pred); +} // end stable_partition_copy() + + +template + thrust::pair + stable_partition_copy(execution_policy &exec, + InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator1 out_true, + OutputIterator2 out_false, + Predicate pred) +{ + // omp prefers generic::stable_partition_copy to cpp::stable_partition_copy + return thrust::system::detail::generic::stable_partition_copy(exec, first, last, stencil, out_true, out_false, pred); +} // end stable_partition_copy() + + +} // end namespace detail +} // end namespace omp +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/omp/detail/reduce.h b/compat/thrust/system/omp/detail/reduce.h new file mode 100644 index 0000000..0cc5ceb --- /dev/null +++ b/compat/thrust/system/omp/detail/reduce.h @@ -0,0 +1,54 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file reduce.h + * \brief OpenMP implementation of reduce algorithms. + */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace omp +{ +namespace detail +{ + + +template + OutputType reduce(execution_policy &exec, + InputIterator first, + InputIterator last, + OutputType init, + BinaryFunction binary_op); + + +} // end namespace detail +} // end namespace omp +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/omp/detail/reduce.inl b/compat/thrust/system/omp/detail/reduce.inl new file mode 100644 index 0000000..1347bfd --- /dev/null +++ b/compat/thrust/system/omp/detail/reduce.inl @@ -0,0 +1,72 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace omp +{ +namespace detail +{ + + +template + OutputType reduce(execution_policy &exec, + InputIterator first, + InputIterator last, + OutputType init, + BinaryFunction binary_op) +{ + typedef typename thrust::iterator_difference::type difference_type; + + const difference_type n = thrust::distance(first,last); + + // determine first and second level decomposition + thrust::system::detail::internal::uniform_decomposition decomp1 = thrust::system::omp::detail::default_decomposition(n); + thrust::system::detail::internal::uniform_decomposition decomp2(decomp1.size() + 1, 1, 1); + + // allocate storage for the initializer and partial sums + // XXX use select_system for Tag + thrust::detail::temporary_array partial_sums(exec, decomp1.size() + 1); + + // set first element of temp array to init + partial_sums[0] = init; + + // accumulate partial sums (first level reduction) + thrust::system::omp::detail::reduce_intervals(exec, first, partial_sums.begin() + 1, binary_op, decomp1); + + // reduce partial sums (second level reduction) + thrust::system::omp::detail::reduce_intervals(exec, partial_sums.begin(), partial_sums.begin(), binary_op, decomp2); + + return partial_sums[0]; +} // end reduce() + + +} // end detail +} // end omp +} // end system +} // end thrust + diff --git a/compat/thrust/system/omp/detail/reduce_by_key.h b/compat/thrust/system/omp/detail/reduce_by_key.h new file mode 100644 index 0000000..d7243ee --- /dev/null +++ b/compat/thrust/system/omp/detail/reduce_by_key.h @@ -0,0 +1,61 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file reduce.h + * \brief OpenMP implementation of reduce algorithms. 
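reduce.inl above performs a two-level reduction: the first uniform_decomposition produces one partial sum per interval in parallel (with init stored as an extra leading partial), and a second single-interval pass folds the partials into the final value. Through the public API this is an ordinary thrust::reduce call with the omp policy; a minimal sketch, assuming the vendored headers are on the include path and OpenMP is enabled:

    #include <thrust/reduce.h>
    #include <thrust/functional.h>
    #include <thrust/system/omp/execution_policy.h>
    #include <vector>
    #include <cstdio>

    int main()
    {
      std::vector<int> v(1 << 20, 1);

      // partial sums are computed per decomposition interval, then combined
      int sum = thrust::reduce(thrust::omp::par, v.begin(), v.end(), 0, thrust::plus<int>());

      std::printf("%d\n", sum);   // prints 1048576
      return 0;
    }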
+ */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace omp +{ +namespace detail +{ + + +template + thrust::pair + reduce_by_key(execution_policy &exec, + InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + OutputIterator1 keys_output, + OutputIterator2 values_output, + BinaryPredicate binary_pred, + BinaryFunction binary_op); + + +} // end namespace detail +} // end namespace omp +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/omp/detail/reduce_by_key.inl b/compat/thrust/system/omp/detail/reduce_by_key.inl new file mode 100644 index 0000000..91402d8 --- /dev/null +++ b/compat/thrust/system/omp/detail/reduce_by_key.inl @@ -0,0 +1,57 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace omp +{ +namespace detail +{ + +template + thrust::pair + reduce_by_key(execution_policy &exec, + InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + OutputIterator1 keys_output, + OutputIterator2 values_output, + BinaryPredicate binary_pred, + BinaryFunction binary_op) +{ + // omp prefers generic::reduce_by_key to cpp::reduce_by_key + return thrust::system::detail::generic::reduce_by_key(exec, keys_first, keys_last, values_first, keys_output, values_output, binary_pred, binary_op); +} // end reduce_by_key() + + +} // end detail +} // end omp +} // end system +} // end thrust + diff --git a/compat/thrust/system/omp/detail/reduce_intervals.h b/compat/thrust/system/omp/detail/reduce_intervals.h new file mode 100644 index 0000000..7bce207 --- /dev/null +++ b/compat/thrust/system/omp/detail/reduce_intervals.h @@ -0,0 +1,53 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file reduce_intervals.h + * \brief OpenMP implementations of reduce_intervals algorithms. 
+ */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace omp +{ +namespace detail +{ + +template +void reduce_intervals(execution_policy &exec, + InputIterator input, + OutputIterator output, + BinaryFunction binary_op, + Decomposition decomp); + +} // end namespace detail +} // end namespace omp +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/omp/detail/reduce_intervals.inl b/compat/thrust/system/omp/detail/reduce_intervals.inl new file mode 100644 index 0000000..0752b8a --- /dev/null +++ b/compat/thrust/system/omp/detail/reduce_intervals.inl @@ -0,0 +1,93 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace omp +{ +namespace detail +{ + +template +void reduce_intervals(execution_policy &, + InputIterator input, + OutputIterator output, + BinaryFunction binary_op, + Decomposition decomp) +{ + // we're attempting to launch an omp kernel, assert we're compiling with omp support + // ======================================================================== + // X Note to the user: If you've found this line due to a compiler error, X + // X you need to enable OpenMP support in your compiler. X + // ======================================================================== + THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); + +#if (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE) + typedef typename thrust::iterator_value::type OutputType; + + // wrap binary_op + thrust::detail::host_function wrapped_binary_op(binary_op); + + typedef thrust::detail::intptr_t index_type; + + index_type n = static_cast(decomp.size()); + +#if (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE) +# pragma omp parallel for +#endif // THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE + for(index_type i = 0; i < n; i++) + { + InputIterator begin = input + decomp[i].begin(); + InputIterator end = input + decomp[i].end(); + + if (begin != end) + { + OutputType sum = thrust::raw_reference_cast(*begin); + + ++begin; + + while (begin != end) + { + sum = wrapped_binary_op(sum, *begin); + ++begin; + } + + OutputIterator tmp = output + i; + *tmp = sum; + } + } +#endif // THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE +} + +} // end namespace detail +} // end namespace omp +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/omp/detail/remove.h b/compat/thrust/system/omp/detail/remove.h new file mode 100644 index 0000000..ebcb496 --- /dev/null +++ b/compat/thrust/system/omp/detail/remove.h @@ -0,0 +1,81 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace omp +{ +namespace detail +{ + +template + ForwardIterator remove_if(execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + Predicate pred); + + +template + ForwardIterator remove_if(execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + InputIterator stencil, + Predicate pred); + + +template + OutputIterator remove_copy_if(execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + Predicate pred); + + +template + OutputIterator remove_copy_if(execution_policy &exec, + InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator result, + Predicate pred); + + +} // end namespace detail +} // end namespace omp +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/omp/detail/remove.inl b/compat/thrust/system/omp/detail/remove.inl new file mode 100644 index 0000000..c056f96 --- /dev/null +++ b/compat/thrust/system/omp/detail/remove.inl @@ -0,0 +1,94 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace omp +{ +namespace detail +{ + +template + ForwardIterator remove_if(execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + Predicate pred) +{ + // omp prefers generic::remove_if to cpp::remove_if + return thrust::system::detail::generic::remove_if(exec, first, last, pred); +} + + +template + ForwardIterator remove_if(execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + InputIterator stencil, + Predicate pred) +{ + // omp prefers generic::remove_if to cpp::remove_if + return thrust::system::detail::generic::remove_if(exec, first, last, stencil, pred); +} + + +template + OutputIterator remove_copy_if(execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + Predicate pred) +{ + // omp prefers generic::remove_copy_if to cpp::remove_copy_if + return thrust::system::detail::generic::remove_copy_if(exec, first, last, result, pred); +} + +template + OutputIterator remove_copy_if(execution_policy &exec, + InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator result, + Predicate pred) +{ + // omp prefers generic::remove_copy_if to cpp::remove_copy_if + return thrust::system::detail::generic::remove_copy_if(exec, first, last, stencil, result, pred); +} + +} // end namespace detail +} // end namespace omp +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/omp/detail/replace.h b/compat/thrust/system/omp/detail/replace.h new file mode 100644 index 0000000..c48555d --- /dev/null +++ b/compat/thrust/system/omp/detail/replace.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system inherits this algorithm +#include + diff --git a/compat/thrust/system/omp/detail/reverse.h b/compat/thrust/system/omp/detail/reverse.h new file mode 100644 index 0000000..04923d1 --- /dev/null +++ b/compat/thrust/system/omp/detail/reverse.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +// this system inherits reverse +#include + diff --git a/compat/thrust/system/omp/detail/scan.h b/compat/thrust/system/omp/detail/scan.h new file mode 100644 index 0000000..c105951 --- /dev/null +++ b/compat/thrust/system/omp/detail/scan.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system inherits scan +#include + diff --git a/compat/thrust/system/omp/detail/scan_by_key.h b/compat/thrust/system/omp/detail/scan_by_key.h new file mode 100644 index 0000000..bfbd5d6 --- /dev/null +++ b/compat/thrust/system/omp/detail/scan_by_key.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system inherits this algorithm +#include + diff --git a/compat/thrust/system/omp/detail/scatter.h b/compat/thrust/system/omp/detail/scatter.h new file mode 100644 index 0000000..c48555d --- /dev/null +++ b/compat/thrust/system/omp/detail/scatter.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system inherits this algorithm +#include + diff --git a/compat/thrust/system/omp/detail/sequence.h b/compat/thrust/system/omp/detail/sequence.h new file mode 100644 index 0000000..811d8f5 --- /dev/null +++ b/compat/thrust/system/omp/detail/sequence.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system inherits sequence +#include + diff --git a/compat/thrust/system/omp/detail/set_operations.h b/compat/thrust/system/omp/detail/set_operations.h new file mode 100644 index 0000000..687edb2 --- /dev/null +++ b/compat/thrust/system/omp/detail/set_operations.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system inherits set_operations +#include + diff --git a/compat/thrust/system/omp/detail/sort.h b/compat/thrust/system/omp/detail/sort.h new file mode 100644 index 0000000..9a480f2 --- /dev/null +++ b/compat/thrust/system/omp/detail/sort.h @@ -0,0 +1,55 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace omp +{ +namespace detail +{ + +template +void stable_sort(execution_policy &exec, + RandomAccessIterator first, + RandomAccessIterator last, + StrictWeakOrdering comp); + +template +void stable_sort_by_key(execution_policy &exec, + RandomAccessIterator1 keys_first, + RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first, + StrictWeakOrdering comp); + +} // end namespace detail +} // end namespace omp +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/omp/detail/sort.inl b/compat/thrust/system/omp/detail/sort.inl new file mode 100644 index 0000000..ab4f4a1 --- /dev/null +++ b/compat/thrust/system/omp/detail/sort.inl @@ -0,0 +1,249 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
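For orientation, a minimal sketch of how the stable_sort entry points declared in sort.h above are reached from user code. Illustrative only; it assumes the program is compiled with OpenMP enabled (for example with -fopenmp), and the values are arbitrary.

#include <thrust/sort.h>
#include <thrust/system/omp/execution_policy.h>
#include <cstddef>
#include <vector>

int main()
{
  std::vector<int> v(16);
  for (std::size_t i = 0; i < v.size(); ++i)
    v[i] = static_cast<int>(v.size() - i);            // descending input

  // Explicit dispatch to the OpenMP backend; this ultimately reaches the
  // omp::detail::stable_sort declared above.
  thrust::stable_sort(thrust::omp::par, v.begin(), v.end());
  return 0;
}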
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#include + +// don't attempt to #include this file without omp support +#if (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE) +#include +#endif // omp support + +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace omp +{ +namespace detail +{ +namespace sort_detail +{ + + +template +void inplace_merge(execution_policy &exec, + RandomAccessIterator first, + RandomAccessIterator middle, + RandomAccessIterator last, + StrictWeakOrdering comp) +{ + typedef typename thrust::iterator_value::type value_type; + + thrust::detail::temporary_array a(exec, first, middle); + thrust::detail::temporary_array b(exec, middle, last); + + thrust::system::cpp::detail::merge(exec, a.begin(), a.end(), b.begin(), b.end(), first, comp); +} + + +template +void inplace_merge_by_key(execution_policy &exec, + RandomAccessIterator1 first1, + RandomAccessIterator1 middle1, + RandomAccessIterator1 last1, + RandomAccessIterator2 first2, + StrictWeakOrdering comp) +{ + typedef typename thrust::iterator_value::type value_type1; + typedef typename thrust::iterator_value::type value_type2; + + RandomAccessIterator2 middle2 = first2 + (middle1 - first1); + RandomAccessIterator2 last2 = first2 + (last1 - first1); + + thrust::detail::temporary_array lhs1(exec, first1, middle1); + thrust::detail::temporary_array rhs1(exec, middle1, last1); + thrust::detail::temporary_array lhs2(exec, first2, middle2); + thrust::detail::temporary_array rhs2(exec, middle2, last2); + + thrust::system::cpp::detail::merge_by_key + (exec, + lhs1.begin(), lhs1.end(), rhs1.begin(), rhs1.end(), + lhs2.begin(), rhs2.begin(), + first1, first2, comp); +} + + +} // end sort_detail + + +template +void stable_sort(execution_policy &exec, + RandomAccessIterator first, + RandomAccessIterator last, + StrictWeakOrdering comp) +{ + // we're attempting to launch an omp kernel, assert we're compiling with omp support + // ======================================================================== + // X Note to the user: If you've found this line due to a compiler error, X + // X you need to enable OpenMP support in your compiler. 
X + // ======================================================================== + THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); + +#if (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE) + typedef typename thrust::iterator_difference::type IndexType; + + if (first == last) + return; + + #pragma omp parallel + { + thrust::system::detail::internal::uniform_decomposition decomp(last - first, 1, omp_get_num_threads()); + + // process id + IndexType p_i = omp_get_thread_num(); + + // every thread sorts its own tile + if (p_i < decomp.size()) + { + thrust::system::cpp::detail::stable_sort(exec, + first + decomp[p_i].begin(), + first + decomp[p_i].end(), + comp); + } + + #pragma omp barrier + + IndexType nseg = decomp.size(); + IndexType h = 2; + + // keep track of which sub-range we're processing + IndexType a=p_i, b=p_i, c=p_i+1; + + while( nseg>1 ) + { + if(c >= decomp.size()) + c = decomp.size() - 1; + + if((p_i % h) == 0 && c > b) + { + thrust::system::omp::detail::sort_detail::inplace_merge + (exec, + first + decomp[a].begin(), + first + decomp[b].end(), + first + decomp[c].end(), + comp); + b = c; + c += h; + } + + nseg = (nseg + 1) / 2; + h *= 2; + + #pragma omp barrier + } + } +#endif // THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE +} + + +template +void stable_sort_by_key(execution_policy &exec, + RandomAccessIterator1 keys_first, + RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first, + StrictWeakOrdering comp) +{ + // we're attempting to launch an omp kernel, assert we're compiling with omp support + // ======================================================================== + // X Note to the user: If you've found this line due to a compiler error, X + // X you need to enable OpenMP support in your compiler. 
X + // ======================================================================== + THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); + +#if (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE) + typedef typename thrust::iterator_difference::type IndexType; + + if (keys_first == keys_last) + return; + + #pragma omp parallel + { + thrust::system::detail::internal::uniform_decomposition decomp(keys_last - keys_first, 1, omp_get_num_threads()); + + // process id + IndexType p_i = omp_get_thread_num(); + + // every thread sorts its own tile + if (p_i < decomp.size()) + { + thrust::system::cpp::detail::stable_sort_by_key(exec, + keys_first + decomp[p_i].begin(), + keys_first + decomp[p_i].end(), + values_first + decomp[p_i].begin(), + comp); + } + + #pragma omp barrier + + IndexType nseg = decomp.size(); + IndexType h = 2; + + // keep track of which sub-range we're processing + IndexType a=p_i, b=p_i, c=p_i+1; + + while( nseg>1 ) + { + if(c >= decomp.size()) + c = decomp.size() - 1; + + if((p_i % h) == 0 && c > b) + { + thrust::system::omp::detail::sort_detail::inplace_merge_by_key + (exec, + keys_first + decomp[a].begin(), + keys_first + decomp[b].end(), + keys_first + decomp[c].end(), + values_first + decomp[a].begin(), + comp); + b = c; + c += h; + } + + nseg = (nseg + 1) / 2; + h *= 2; + + #pragma omp barrier + } + } +#endif // THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE +} + + +} // end namespace detail +} // end namespace omp +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/omp/detail/swap_ranges.h b/compat/thrust/system/omp/detail/swap_ranges.h new file mode 100644 index 0000000..e683aaa --- /dev/null +++ b/compat/thrust/system/omp/detail/swap_ranges.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// omp inherits swap_ranges +#include + diff --git a/compat/thrust/system/omp/detail/tabulate.h b/compat/thrust/system/omp/detail/tabulate.h new file mode 100644 index 0000000..da65d8e --- /dev/null +++ b/compat/thrust/system/omp/detail/tabulate.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
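Stepping back from the sort.inl implementation above: each OpenMP thread first sorts its own tile of the input with the sequential backend, and the sorted tiles are then combined in a pairwise merge tree, with h doubling and a barrier between rounds; the sort_detail::inplace_merge helper makes each merge "in place" by staging both halves in temporary arrays and merging them back into the original storage. The stand-alone trace below, illustrative only and not taken from the patch, replays the schedule produced by the loop variables a, b, c and h for a small tile count.

#include <cstdio>

int main()
{
  const int p = 4;                        // pretend decomposition into 4 tiles

  for (int i = 0; i < p; ++i)             // what each thread i would do
  {
    int nseg = p, h = 2, a = i, b = i, c = i + 1;
    while (nseg > 1)
    {
      if (c >= p) c = p - 1;
      if (i % h == 0 && c > b)
      {
        // in the real code this is an inplace_merge guarded by omp barriers
        std::printf("h=%d: thread %d merges tiles %d..%d with %d..%d\n",
                    h, i, a, b, b + 1, c);
        b = c;
        c += h;
      }
      nseg = (nseg + 1) / 2;
      h *= 2;
    }
  }
  return 0;
}

With p = 4, threads 0 and 2 merge adjacent tile pairs at h = 2, thread 0 merges the two resulting halves at h = 4, and threads 1 and 3 only sort their own tiles.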
+ */ + +#pragma once + +#include + +// this system inherits tabulate +#include + diff --git a/compat/thrust/system/omp/detail/temporary_buffer.h b/compat/thrust/system/omp/detail/temporary_buffer.h new file mode 100644 index 0000000..628bd75 --- /dev/null +++ b/compat/thrust/system/omp/detail/temporary_buffer.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system has no special temporary buffer functions + diff --git a/compat/thrust/system/omp/detail/transform.h b/compat/thrust/system/omp/detail/transform.h new file mode 100644 index 0000000..70ce1f4 --- /dev/null +++ b/compat/thrust/system/omp/detail/transform.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// omp inherits transform +#include + diff --git a/compat/thrust/system/omp/detail/transform_reduce.h b/compat/thrust/system/omp/detail/transform_reduce.h new file mode 100644 index 0000000..23ed070 --- /dev/null +++ b/compat/thrust/system/omp/detail/transform_reduce.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system inherits transform_reduce +#include + diff --git a/compat/thrust/system/omp/detail/transform_scan.h b/compat/thrust/system/omp/detail/transform_scan.h new file mode 100644 index 0000000..fc2e55d --- /dev/null +++ b/compat/thrust/system/omp/detail/transform_scan.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system inherits transform_scan +#include + diff --git a/compat/thrust/system/omp/detail/uninitialized_copy.h b/compat/thrust/system/omp/detail/uninitialized_copy.h new file mode 100644 index 0000000..944f4ba --- /dev/null +++ b/compat/thrust/system/omp/detail/uninitialized_copy.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system inherits uninitialized_copy +#include + diff --git a/compat/thrust/system/omp/detail/uninitialized_fill.h b/compat/thrust/system/omp/detail/uninitialized_fill.h new file mode 100644 index 0000000..b9d6de2 --- /dev/null +++ b/compat/thrust/system/omp/detail/uninitialized_fill.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system inherits uninitialized_fill +#include + diff --git a/compat/thrust/system/omp/detail/unique.h b/compat/thrust/system/omp/detail/unique.h new file mode 100644 index 0000000..60c617b --- /dev/null +++ b/compat/thrust/system/omp/detail/unique.h @@ -0,0 +1,59 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace omp +{ +namespace detail +{ + + +template + ForwardIterator unique(execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + BinaryPredicate binary_pred); + + +template + OutputIterator unique_copy(execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator output, + BinaryPredicate binary_pred); + + +} // end namespace detail +} // end namespace omp +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/omp/detail/unique.inl b/compat/thrust/system/omp/detail/unique.inl new file mode 100644 index 0000000..d66ac3b --- /dev/null +++ b/compat/thrust/system/omp/detail/unique.inl @@ -0,0 +1,66 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace omp +{ +namespace detail +{ + + +template + ForwardIterator unique(execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + BinaryPredicate binary_pred) +{ + // omp prefers generic::unique to cpp::unique + return thrust::system::detail::generic::unique(exec,first,last,binary_pred); +} // end unique() + + +template + OutputIterator unique_copy(execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator output, + BinaryPredicate binary_pred) +{ + // omp prefers generic::unique_copy to cpp::unique_copy + return thrust::system::detail::generic::unique_copy(exec,first,last,output,binary_pred); +} // end unique_copy() + + +} // end namespace detail +} // end namespace omp +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/omp/detail/unique_by_key.h b/compat/thrust/system/omp/detail/unique_by_key.h new file mode 100644 index 0000000..8fdde66 --- /dev/null +++ b/compat/thrust/system/omp/detail/unique_by_key.h @@ -0,0 +1,67 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace omp +{ +namespace detail +{ + + +template + thrust::pair + unique_by_key(execution_policy &exec, + ForwardIterator1 keys_first, + ForwardIterator1 keys_last, + ForwardIterator2 values_first, + BinaryPredicate binary_pred); + + +template + thrust::pair + unique_by_key_copy(execution_policy &exec, + InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + OutputIterator1 keys_output, + OutputIterator2 values_output, + BinaryPredicate binary_pred); + + +} // end namespace detail +} // end namespace omp +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/omp/detail/unique_by_key.inl b/compat/thrust/system/omp/detail/unique_by_key.inl new file mode 100644 index 0000000..644b5ed --- /dev/null +++ b/compat/thrust/system/omp/detail/unique_by_key.inl @@ -0,0 +1,74 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace omp +{ +namespace detail +{ + + +template + thrust::pair + unique_by_key(execution_policy &exec, + ForwardIterator1 keys_first, + ForwardIterator1 keys_last, + ForwardIterator2 values_first, + BinaryPredicate binary_pred) +{ + // omp prefers generic::unique_by_key to cpp::unique_by_key + return thrust::system::detail::generic::unique_by_key(exec,keys_first,keys_last,values_first,binary_pred); +} // end unique_by_key() + + +template + thrust::pair + unique_by_key_copy(execution_policy &exec, + InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + OutputIterator1 keys_output, + OutputIterator2 values_output, + BinaryPredicate binary_pred) +{ + // omp prefers generic::unique_by_key_copy to cpp::unique_by_key_copy + return thrust::system::detail::generic::unique_by_key_copy(exec,keys_first,keys_last,values_first,keys_output,values_output,binary_pred); +} // end unique_by_key_copy() + + +} // end namespace detail +} // end namespace omp +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/omp/detail/vector.inl b/compat/thrust/system/omp/detail/vector.inl new file mode 100644 index 0000000..32c845c --- /dev/null +++ b/compat/thrust/system/omp/detail/vector.inl @@ -0,0 +1,97 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
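For reference, a minimal sketch of the semantics behind the unique_by_key entry points declared and forwarded above: consecutive equal keys are collapsed, only the value paired with the first key of each run survives, and the returned pair marks the new ends of both ranges. Illustrative only; assumes an OpenMP-enabled build and arbitrary data.

#include <thrust/unique.h>
#include <thrust/pair.h>
#include <thrust/system/omp/execution_policy.h>
#include <cstdio>

int main()
{
  int  keys[7] = {1, 1, 2, 2, 2, 3, 3};
  char vals[7] = {'a', 'b', 'c', 'd', 'e', 'f', 'g'};

  // Expected result: keys {1, 2, 3}, vals {'a', 'c', 'f'}.
  thrust::pair<int *, char *> ends =
      thrust::unique_by_key(thrust::omp::par, keys, keys + 7, vals);

  for (int *k = keys; k != ends.first; ++k)
    std::printf("%d ", *k);
  std::printf("\n");
  return 0;
}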
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace omp +{ + +template + vector + ::vector() + : super_t() +{} + +template + vector + ::vector(size_type n) + : super_t(n) +{} + +template + vector + ::vector(size_type n, const value_type &value) + : super_t(n,value) +{} + +template + vector + ::vector(const vector &x) + : super_t(x) +{} + +template + template + vector + ::vector(const thrust::detail::vector_base &x) + : super_t(x) +{} + +template + template + vector + ::vector(const std::vector &x) + : super_t(x) +{} + +template + template + vector + ::vector(InputIterator first, InputIterator last) + : super_t(first,last) +{} + +template + template + vector & + vector + ::operator=(const std::vector &x) +{ + super_t::operator=(x); + return *this; +} + +template + template + vector & + vector + ::operator=(const thrust::detail::vector_base &x) +{ + super_t::operator=(x); + return *this; +} + +} // end omp +} // end system +} // end thrust + diff --git a/compat/thrust/system/omp/execution_policy.h b/compat/thrust/system/omp/execution_policy.h new file mode 100644 index 0000000..7d5d1d8 --- /dev/null +++ b/compat/thrust/system/omp/execution_policy.h @@ -0,0 +1,156 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +/*! \file thrust/system/omp/execution_policy.h + * \brief Execution policies for Thrust's OpenMP system. + */ + +#include + +// get the execution policies definitions first +#include + +// get the definition of par +#include + +// now get all the algorithm definitions + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +// define these entities here for the purpose of Doxygenating them +// they are actually defined elsewhere +#if 0 +namespace thrust +{ +namespace system +{ +namespace omp +{ + + +/*! \addtogroup execution_policies + * \{ + */ + + +/*! \p thrust::omp::execution_policy is the base class for all Thrust parallel execution + * policies which are derived from Thrust's OpenMP backend system. + */ +template +struct execution_policy : thrust::execution_policy +{}; + + +/*! \p omp::tag is a type representing Thrust's standard C++ backend system in C++'s type system. + * Iterators "tagged" with a type which is convertible to \p omp::tag assert that they may be + * "dispatched" to algorithm implementations in the \p omp system. + */ +struct tag : thrust::system::omp::execution_policy { unspecified }; + + +/*! 
\p thrust::omp::par is the parallel execution policy associated with Thrust's OpenMP + * backend system. + * + * Instead of relying on implicit algorithm dispatch through iterator system tags, users may + * directly target Thrust's OpenMP backend system by providing \p thrust::omp::par as an algorithm + * parameter. + * + * Explicit dispatch can be useful in avoiding the introduction of data copies into containers such + * as \p thrust::omp::vector. + * + * The type of \p thrust::omp::par is implementation-defined. + * + * The following code snippet demonstrates how to use \p thrust::omp::par to explicitly dispatch an + * invocation of \p thrust::for_each to the OpenMP backend system: + * + * \code + * #include + * #include + * #include + * + * struct printf_functor + * { + * __host__ __device__ + * void operator()(int x) + * { + * printf("%d\n"); + * } + * }; + * ... + * int vec[3]; + * vec[0] = 0; vec[1] = 1; vec[2] = 2; + * + * thrust::for_each(thrust::omp::par, vec.begin(), vec.end(), printf_functor()); + * + * // 0 1 2 is printed to standard output in some unspecified order + * \endcode + */ +static const unspecified par; + + +/*! \} + */ + + +} // end cpp +} // end system +} // end thrust +#endif + + diff --git a/compat/thrust/system/omp/memory.h b/compat/thrust/system/omp/memory.h new file mode 100644 index 0000000..0a23434 --- /dev/null +++ b/compat/thrust/system/omp/memory.h @@ -0,0 +1,414 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file thrust/system/omp/memory.h + * \brief Managing memory associated with Thrust's OpenMP system. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace omp +{ + +template class pointer; + +} // end omp +} // end system +} // end thrust + + +/*! \cond + */ + +// specialize std::iterator_traits to avoid problems with the name of +// pointer's constructor shadowing its nested pointer type +// do this before pointer is defined so the specialization is correctly +// used inside the definition +namespace std +{ + +template + struct iterator_traits > +{ + private: + typedef thrust::system::omp::pointer ptr; + + public: + typedef typename ptr::iterator_category iterator_category; + typedef typename ptr::value_type value_type; + typedef typename ptr::difference_type difference_type; + typedef ptr pointer; + typedef typename ptr::reference reference; +}; // end iterator_traits + +} // end std + +/*! \endcond + */ + + +namespace thrust +{ +namespace system +{ + +/*! \addtogroup system_backends Systems + * \ingroup system + * \{ + */ + +/*! \namespace thrust::system::omp + * \brief \p thrust::system::omp is the namespace containing functionality for allocating, manipulating, + * and deallocating memory available to Thrust's OpenMP backend system. 
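Complementing the thrust::omp::par example above: when data already lives in containers or pointers from the omp system, algorithms dispatch to the OpenMP backend implicitly through the iterators' system tag, so no policy argument is needed. A minimal sketch under the same assumptions (OpenMP-enabled build); the container and functor choices are arbitrary.

#include <thrust/system/omp/vector.h>
#include <thrust/transform.h>
#include <thrust/functional.h>

void negate_in_place()
{
  thrust::omp::vector<int> v(8, 1);

  // The iterators carry omp::tag, so this transform runs on the OpenMP
  // backend without naming thrust::omp::par explicitly.
  thrust::transform(v.begin(), v.end(), v.begin(), thrust::negate<int>());
}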
+ * The identifiers are provided in a separate namespace underneath thrust::system + * for import convenience but are also aliased in the top-level thrust::omp + * namespace for easy access. + * + */ +namespace omp +{ + +// forward declaration of reference for pointer +template class reference; + +/*! \cond + */ + +// XXX nvcc + msvc have trouble instantiating reference below +// this is a workaround +namespace detail +{ + +template + struct reference_msvc_workaround +{ + typedef thrust::system::omp::reference type; +}; // end reference_msvc_workaround + +} // end detail + +/*! \endcond + */ + + +/*! \p pointer stores a pointer to an object allocated in memory available to the omp system. + * This type provides type safety when dispatching standard algorithms on ranges resident + * in omp memory. + * + * \p pointer has pointer semantics: it may be dereferenced and manipulated with pointer arithmetic. + * + * \p pointer can be created with the function \p omp::malloc, or by explicitly calling its constructor + * with a raw pointer. + * + * The raw pointer encapsulated by a \p pointer may be obtained by eiter its get member function + * or the \p raw_pointer_cast function. + * + * \note \p pointer is not a "smart" pointer; it is the programmer's responsibility to deallocate memory + * pointed to by \p pointer. + * + * \tparam T specifies the type of the pointee. + * + * \see omp::malloc + * \see omp::free + * \see raw_pointer_cast + */ +template + class pointer + : public thrust::pointer< + T, + thrust::system::omp::tag, + thrust::system::omp::reference, + thrust::system::omp::pointer + > +{ + /*! \cond + */ + + private: + typedef thrust::pointer< + T, + thrust::system::omp::tag, + //thrust::system::omp::reference, + typename detail::reference_msvc_workaround::type, + thrust::system::omp::pointer + > super_t; + + /*! \endcond + */ + + public: + // note that omp::pointer's member functions need __host__ __device__ + // to interoperate with nvcc + iterators' dereference member function + + /*! \p pointer's no-argument constructor initializes its encapsulated pointer to \c 0. + */ + __host__ __device__ + pointer() : super_t() {} + + /*! This constructor allows construction of a pointer from a T*. + * + * \param ptr A raw pointer to copy from, presumed to point to a location in memory + * accessible by the \p omp system. + * \tparam OtherT \p OtherT shall be convertible to \p T. + */ + template + __host__ __device__ + explicit pointer(OtherT *ptr) : super_t(ptr) {} + + /*! This constructor allows construction from another pointer-like object with related type. + * + * \param other The \p OtherPointer to copy. + * \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible + * to \p thrust::system::omp::tag and its element type shall be convertible to \p T. + */ + template + __host__ __device__ + pointer(const OtherPointer &other, + typename thrust::detail::enable_if_pointer_is_convertible< + OtherPointer, + pointer + >::type * = 0) : super_t(other) {} + + /*! Assignment operator allows assigning from another pointer-like object with related type. + * + * \param other The other pointer-like object to assign from. + * \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible + * to \p thrust::system::omp::tag and its element type shall be convertible to \p T. 
+ */ + template + __host__ __device__ + typename thrust::detail::enable_if_pointer_is_convertible< + OtherPointer, + pointer, + pointer & + >::type + operator=(const OtherPointer &other) + { + return super_t::operator=(other); + } +}; // end pointer + + +/*! \p reference is a wrapped reference to an object stored in memory available to the \p omp system. + * \p reference is the type of the result of dereferencing a \p omp::pointer. + * + * \tparam T Specifies the type of the referenced object. + */ +template + class reference + : public thrust::reference< + T, + thrust::system::omp::pointer, + thrust::system::omp::reference + > +{ + /*! \cond + */ + + private: + typedef thrust::reference< + T, + thrust::system::omp::pointer, + thrust::system::omp::reference + > super_t; + + /*! \endcond + */ + + public: + /*! \cond + */ + + typedef typename super_t::value_type value_type; + typedef typename super_t::pointer pointer; + + /*! \endcond + */ + + /*! This constructor initializes this \p reference to refer to an object + * pointed to by the given \p pointer. After this \p reference is constructed, + * it shall refer to the object pointed to by \p ptr. + * + * \param ptr A \p pointer to copy from. + */ + __host__ __device__ + explicit reference(const pointer &ptr) + : super_t(ptr) + {} + + /*! This constructor accepts a const reference to another \p reference of related type. + * After this \p reference is constructed, it shall refer to the same object as \p other. + * + * \param other A \p reference to copy from. + * \tparam OtherT The element type of the other \p reference. + * + * \note This constructor is templated primarily to allow initialization of reference + * from reference. + */ + template + __host__ __device__ + reference(const reference &other, + typename thrust::detail::enable_if_convertible< + typename reference::pointer, + pointer + >::type * = 0) + : super_t(other) + {} + + /*! Copy assignment operator copy assigns from another \p reference of related type. + * + * \param other The other \p reference to assign from. + * \return *this + * \tparam OtherT The element type of the other \p reference. + */ + template + reference &operator=(const reference &other); + + /*! Assignment operator assigns from a \p value_type. + * + * \param x The \p value_type to assign from. + * \return *this + */ + reference &operator=(const value_type &x); +}; // end reference + +/*! Exchanges the values of two objects referred to by \p reference. + * \p x The first \p reference of interest. + * \p y The second \p reference ot interest. + */ +template +__host__ __device__ +void swap(reference x, reference y); + +/*! Allocates an area of memory available to Thrust's omp system. + * \param n Number of bytes to allocate. + * \return A omp::pointer pointing to the beginning of the newly + * allocated memory. A null omp::pointer is returned if + * an error occurs. + * \note The omp::pointer returned by this function must be + * deallocated with \p omp::free. + * \see omp::free + * \see std::malloc + */ +inline pointer malloc(std::size_t n); + +/*! Allocates a typed area of memory available to Thrust's omp system. + * \param n Number of elements to allocate. + * \return A omp::pointer pointing to the beginning of the newly + * allocated memory. A null omp::pointer is returned if + * an error occurs. + * \note The omp::pointer returned by this function must be + * deallocated with \p omp::free. + * \see omp::free + * \see std::malloc + */ +template +inline pointer malloc(std::size_t n); + +/*! 
Deallocates an area of memory previously allocated by omp::malloc. + * \param ptr A omp::pointer pointing to the beginning of an area + * of memory previously allocated with omp::malloc. + * \see omp::malloc + * \see std::free + */ +inline void free(pointer ptr); + +// XXX upon c++11 +// template using allocator = thrust::detail::malloc_allocator >; + +/*! \p omp::allocator is the default allocator used by the \p omp system's containers such as + * omp::vector if no user-specified allocator is provided. \p omp::allocator allocates + * (deallocates) storage with \p omp::malloc (\p omp::free). + */ +template + struct allocator + : thrust::detail::malloc_allocator< + T, + tag, + pointer + > +{ + /*! The \p rebind metafunction provides the type of an \p allocator + * instantiated with another type. + * + * \tparam U The other type to use for instantiation. + */ + template + struct rebind + { + /*! The typedef \p other gives the type of the rebound \p allocator. + */ + typedef allocator other; + }; + + /*! No-argument constructor has no effect. + */ + __host__ __device__ + inline allocator() {} + + /*! Copy constructor has no effect. + */ + __host__ __device__ + inline allocator(const allocator &) {} + + /*! Constructor from other \p allocator has no effect. + */ + template + __host__ __device__ + inline allocator(const allocator &) {} + + /*! Destructor has no effect. + */ + __host__ __device__ + inline ~allocator() {} +}; // end allocator + +} // end omp + +/*! \} + */ + +} // end system + +/*! \namespace thrust::omp + * \brief \p thrust::omp is a top-level alias for thrust::system::omp. + */ +namespace omp +{ + +using thrust::system::omp::pointer; +using thrust::system::omp::reference; +using thrust::system::omp::malloc; +using thrust::system::omp::free; +using thrust::system::omp::allocator; + +} // end omp + +} // end thrust + +#include + diff --git a/compat/thrust/system/omp/vector.h b/compat/thrust/system/omp/vector.h new file mode 100644 index 0000000..5f45a91 --- /dev/null +++ b/compat/thrust/system/omp/vector.h @@ -0,0 +1,149 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file thrust/system/omp/vector.h + * \brief A dynamically-sizable array of elements which reside in memory available to + * Thrust's OpenMP system. + */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ + +// forward declaration of host_vector +// XXX why is this here? it doesn't seem necessary for anything below +template class host_vector; + +namespace system +{ +namespace omp +{ + +// XXX upon c++11 +// template > using vector = thrust::detail::vector_base; + +/*! \p omp::vector is a container that supports random access to elements, + * constant time removal of elements at the end, and linear time insertion + * and removal of elements at the beginning or in the middle. The number of + * elements in a \p omp::vector may vary dynamically; memory management is + * automatic. 
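A minimal sketch of the allocation interface documented above: allocate typed storage with omp::malloc, use it through the tagged pointer, recover the raw pointer where plain C++ code needs one, and release it with omp::free. Illustrative only; the include paths follow the layout added by this patch, and the sizes are arbitrary.

#include <thrust/system/omp/memory.h>
#include <thrust/fill.h>
#include <thrust/memory.h>                  // raw_pointer_cast

void allocate_and_fill()
{
  // Ten ints in memory available to the OpenMP system.
  thrust::omp::pointer<int> p = thrust::omp::malloc<int>(10);

  // The tagged pointer dispatches algorithms to the omp backend.
  thrust::fill(p, p + 10, 42);

  int *raw = thrust::raw_pointer_cast(p);   // for non-Thrust code
  (void)raw;

  // pointer is not a smart pointer: the caller releases the storage.
  thrust::omp::free(p);
}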
The elements contained in an \p omp::vector reside in memory + * available to the \p omp system. + * + * \tparam T The element type of the \p omp::vector. + * \tparam Allocator The allocator type of the \p omp::vector. Defaults to \p omp::allocator. + * + * \see http://www.sgi.com/tech/stl/Vector.html + * \see host_vector For the documentation of the complete interface which is + * shared by \p omp::vector + * \see device_vector + */ +template > + class vector + : public thrust::detail::vector_base +{ + /*! \cond + */ + private: + typedef thrust::detail::vector_base super_t; + /*! \endcond + */ + + public: + + /*! \cond + */ + typedef typename super_t::size_type size_type; + typedef typename super_t::value_type value_type; + /*! \endcond + */ + + /*! This constructor creates an empty \p omp::vector. + */ + vector(); + + /*! This constructor creates a \p omp::vector with \p n default-constructed elements. + * \param n The size of the \p omp::vector to create. + */ + explicit vector(size_type n); + + /*! This constructor creates a \p omp::vector with \p n copies of \p value. + * \param n The size of the \p omp::vector to create. + * \param value An element to copy. + */ + explicit vector(size_type n, const value_type &value); + + /*! Copy constructor copies from another \p omp::vector. + * \param x The other \p omp::vector to copy. + */ + vector(const vector &x); + + /*! This constructor copies from another Thrust vector-like object. + * \param x The other object to copy from. + */ + template + vector(const thrust::detail::vector_base &x); + + /*! This constructor copies from a \c std::vector. + * \param x The \c std::vector to copy from. + */ + template + vector(const std::vector &x); + + /*! This constructor creates an \p omp::vector by copying from a range. + * \param first The beginning of the range. + * \param last The end of the range. + */ + template + vector(InputIterator first, InputIterator last); + + // XXX vector_base should take a Derived type so we don't have to define these superfluous assigns + + /*! Assignment operator assigns from a \c std::vector. + * \param x The \c std::vector to assign from. + * \return *this + */ + template + vector &operator=(const std::vector &x); + + /*! Assignment operator assigns from another Thrust vector-like object. + * \param x The other object to assign from. + * \return *this + */ + template + vector &operator=(const thrust::detail::vector_base &x); +}; // end vector + +} // end omp +} // end system + +// alias system::omp names at top-level +namespace omp +{ + +using thrust::system::omp::vector; + +} // end omp + +} // end thrust + +#include + diff --git a/compat/thrust/system/system_error.h b/compat/thrust/system/system_error.h new file mode 100644 index 0000000..6f94b61 --- /dev/null +++ b/compat/thrust/system/system_error.h @@ -0,0 +1,179 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! 
\file system/system_error.h + * \brief An exception object used to report error conditions that have an + * associated error code + */ + +#pragma once + +#include +#include +#include + +#include + +namespace thrust +{ + +namespace system +{ + +// [19.5.5] Class system_error + +// [19.5.5.1] Class system_error overview + +/*! \addtogroup system_diagnostics System Diagnostics + * \ingroup system + * \{ + */ + +/*! \brief The class \p system_error describes an exception object used to report error + * conditions that have an associated \p error_code. Such error conditions typically + * originate from the operating system or other low-level application program interfaces. + * + * Thrust uses \p system_error to report the error codes returned from device backends + * such as the CUDA runtime. + * + * The following code listing demonstrates how to catch a \p system_error to recover + * from an error. + * + * \code + * + * #include + * #include + * #include + * + * void terminate_gracefully(void) + * { + * // application-specific termination code here + * ... + * } + * + * int main(void) + * { + * try + * { + * thrust::device_vector vec; + * thrust::sort(vec.begin(), vec.end()); + * } + * catch(thrust::system_error e) + * { + * std::cerr << "Error inside sort: " << e.what() << std::endl; + * terminate_gracefully(); + * } + * + * return 0; + * } + * + * \endcode + * + * \note If an error represents an out-of-memory condition, implementations are encouraged + * to throw an exception object of type \p std::bad_alloc rather than \p system_error. + */ +class system_error + : public std::runtime_error +{ + public: + // [19.5.5.2] Class system_error members + + /*! Constructs an object of class \p system_error. + * \param ec The value returned by \p code(). + * \param what_arg A string to include in the result returned by \p what(). + * \post code() == ec. + * \post std::string(what()).find(what_arg) != string::npos. + */ + inline system_error(error_code ec, const std::string &what_arg); + + /*! Constructs an object of class \p system_error. + * \param ec The value returned by \p code(). + * \param what_arg A string to include in the result returned by \p what(). + * \post code() == ec. + * \post std::string(what()).find(what_arg) != string::npos. + */ + inline system_error(error_code ec, const char *what_arg); + + /*! Constructs an object of class \p system_error. + * \param ec The value returned by \p code(). + * \post code() == ec. + */ + inline system_error(error_code ec); + + /*! Constructs an object of class \p system_error. + * \param ev The error value used to create an \p error_code. + * \param ecat The \p error_category used to create an \p error_code. + * \param what_arg A string to include in the result returned by \p what(). + * \post code() == error_code(ev, ecat). + * \post std::string(what()).find(what_arg) != string::npos. + */ + inline system_error(int ev, const error_category &ecat, const std::string &what_arg); + + /*! Constructs an object of class \p system_error. + * \param ev The error value used to create an \p error_code. + * \param ecat The \p error_category used to create an \p error_code. + * \param what_arg A string to include in the result returned by \p what(). + * \post code() == error_code(ev, ecat). + * \post std::string(what()).find(what_arg) != string::npos. + */ + inline system_error(int ev, const error_category &ecat, const char *what_arg); + + /*! Constructs an object of class \p system_error. + * \param ev The error value used to create an \p error_code. 
+ * \param ecat The \p error_category used to create an \p error_code. + * \post code() == error_code(ev, ecat). + */ + inline system_error(int ev, const error_category &ecat); + + /*! Destructor does not throw. + */ + inline virtual ~system_error(void) throw () {}; + + /*! Returns an object encoding the error. + * \return ec or error_code(ev, ecat), from the + * constructor, as appropriate. + */ + inline const error_code &code(void) const throw(); + + /*! Returns a human-readable string indicating the nature of the error. + * \return a string incorporating code().message() and the + * arguments supplied in the constructor. + */ + inline const char *what(void) const throw(); + + /*! \cond + */ + private: + error_code m_error_code; + mutable std::string m_what; + + /*! \endcond + */ +}; // end system_error + +} // end system + +/*! \} // end system_diagnostics + */ + +// import names into thrust:: +using system::system_error; + +} // end thrust + +#include + diff --git a/compat/thrust/system/tbb/detail/adjacent_difference.h b/compat/thrust/system/tbb/detail/adjacent_difference.h new file mode 100644 index 0000000..37c9adc --- /dev/null +++ b/compat/thrust/system/tbb/detail/adjacent_difference.h @@ -0,0 +1,50 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace tbb +{ +namespace detail +{ + +template + OutputIterator adjacent_difference(execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + BinaryFunction binary_op) +{ + // tbb prefers generic::adjacent_difference to cpp::adjacent_difference + return thrust::system::detail::generic::adjacent_difference(exec, first, last, result, binary_op); +} // end adjacent_difference() + +} // end detail +} // end tbb +} // end system +} // end thrust + diff --git a/compat/thrust/system/tbb/detail/assign_value.h b/compat/thrust/system/tbb/detail/assign_value.h new file mode 100644 index 0000000..eda3b97 --- /dev/null +++ b/compat/thrust/system/tbb/detail/assign_value.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +// this system inherits assign_value +#include + diff --git a/compat/thrust/system/tbb/detail/binary_search.h b/compat/thrust/system/tbb/detail/binary_search.h new file mode 100644 index 0000000..8dec989 --- /dev/null +++ b/compat/thrust/system/tbb/detail/binary_search.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system inherits binary_search +#include + diff --git a/compat/thrust/system/tbb/detail/copy.h b/compat/thrust/system/tbb/detail/copy.h new file mode 100644 index 0000000..7604e6f --- /dev/null +++ b/compat/thrust/system/tbb/detail/copy.h @@ -0,0 +1,57 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in ctbbliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace tbb +{ +namespace detail +{ + + +template +OutputIterator copy(execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result); + + +template +OutputIterator copy_n(execution_policy &exec, + InputIterator first, + Size n, + OutputIterator result); + + +} // end namespace detail +} // end namespace tbb +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/tbb/detail/copy.inl b/compat/thrust/system/tbb/detail/copy.inl new file mode 100644 index 0000000..6d354d0 --- /dev/null +++ b/compat/thrust/system/tbb/detail/copy.inl @@ -0,0 +1,134 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in ctbbliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace tbb +{ +namespace detail +{ +namespace dispatch +{ + +template + OutputIterator copy(execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + thrust::incrementable_traversal_tag) +{ + return thrust::system::cpp::detail::copy(exec, first, last, result); +} // end copy() + + +template + OutputIterator copy(execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + thrust::random_access_traversal_tag) +{ + return thrust::system::detail::generic::copy(exec, first, last, result); +} // end copy() + + +template + OutputIterator copy_n(execution_policy &exec, + InputIterator first, + Size n, + OutputIterator result, + thrust::incrementable_traversal_tag) +{ + return thrust::system::cpp::detail::copy_n(exec, first, n, result); +} // end copy_n() + + +template + OutputIterator copy_n(execution_policy &exec, + InputIterator first, + Size n, + OutputIterator result, + thrust::random_access_traversal_tag) +{ + return thrust::system::detail::generic::copy_n(exec, first, n, result); +} // end copy_n() + +} // end dispatch + + +template +OutputIterator copy(execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result) +{ + typedef typename thrust::iterator_traversal::type traversal1; + typedef typename thrust::iterator_traversal::type traversal2; + + typedef typename thrust::detail::minimum_type::type traversal; + + // dispatch on minimum traversal + return thrust::system::tbb::detail::dispatch::copy(exec,first,last,result,traversal()); +} // end copy() + + + +template +OutputIterator copy_n(execution_policy &exec, + InputIterator first, + Size n, + OutputIterator result) +{ + typedef typename thrust::iterator_traversal::type traversal1; + typedef typename thrust::iterator_traversal::type traversal2; + + typedef typename thrust::detail::minimum_type::type traversal; + + // dispatch on minimum traversal + return thrust::system::tbb::detail::dispatch::copy_n(exec,first,n,result,traversal()); +} // end copy_n() + + +} // end namespace detail +} // end namespace tbb +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/tbb/detail/copy_if.h b/compat/thrust/system/tbb/detail/copy_if.h new file mode 100644 index 0000000..ffbd4f8 --- /dev/null +++ b/compat/thrust/system/tbb/detail/copy_if.h @@ -0,0 +1,50 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
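The traversal-based dispatch in copy.inl above selects the generic (parallelizable) path only when both iterators are random access, and otherwise falls back to the sequential cpp copy. A small sketch of the two cases, under the same assumptions about include paths and TBB linkage:

#include <thrust/copy.h>
#include <thrust/system/tbb/execution_policy.h>
#include <list>
#include <vector>

void copy_examples()
{
  std::vector<int> v(1000, 1);
  std::vector<int> dst1(1000);
  // both sides are random access, so the minimum traversal is
  // random_access_traversal_tag and dispatch::copy takes the generic path
  thrust::copy(thrust::tbb::par, v.begin(), v.end(), dst1.begin());

  std::list<int> l(v.begin(), v.end());
  std::vector<int> dst2(1000);
  // the list iterator is only bidirectional, so the minimum traversal is not
  // random access and dispatch::copy falls back to the sequential cpp copy
  thrust::copy(thrust::tbb::par, l.begin(), l.end(), dst2.begin());
}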
+ */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace tbb +{ +namespace detail +{ + + +template + OutputIterator copy_if(tag, + InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator result, + Predicate pred); + + +} // end detail +} // end tbb +} // end system +} // end thrust + +#include + diff --git a/compat/thrust/system/tbb/detail/copy_if.inl b/compat/thrust/system/tbb/detail/copy_if.inl new file mode 100644 index 0000000..4353b3b --- /dev/null +++ b/compat/thrust/system/tbb/detail/copy_if.inl @@ -0,0 +1,131 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace tbb +{ +namespace detail +{ +namespace copy_if_detail +{ + +template +struct body +{ + + InputIterator1 first; + InputIterator2 stencil; + OutputIterator result; + thrust::detail::host_function pred; + Size sum; + + body(InputIterator1 first, InputIterator2 stencil, OutputIterator result, Predicate pred) + : first(first), stencil(stencil), result(result), pred(pred), sum(0) + {} + + body(body& b, ::tbb::split) + : first(b.first), stencil(b.stencil), result(b.result), pred(b.pred), sum(0) + {} + + void operator()(const ::tbb::blocked_range& r, ::tbb::pre_scan_tag) + { + InputIterator2 iter = stencil + r.begin(); + + for (Size i = r.begin(); i != r.end(); ++i, ++iter) + { + if (pred(*iter)) + ++sum; + } + } + + void operator()(const ::tbb::blocked_range& r, ::tbb::final_scan_tag) + { + InputIterator1 iter1 = first + r.begin(); + InputIterator2 iter2 = stencil + r.begin(); + OutputIterator iter3 = result + sum; + + for (Size i = r.begin(); i != r.end(); ++i, ++iter1, ++iter2) + { + if (pred(*iter2)) + { + *iter3 = *iter1; + ++sum; + ++iter3; + } + } + } + + void reverse_join(body& b) + { + sum = b.sum + sum; + } + + void assign(body& b) + { + sum = b.sum; + } +}; // end body + +} // end copy_if_detail + +template + OutputIterator copy_if(tag, + InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator result, + Predicate pred) +{ + typedef typename thrust::iterator_difference::type Size; + typedef typename copy_if_detail::body Body; + + Size n = thrust::distance(first, last); + + if (n != 0) + { + Body body(first, stencil, result, pred); + ::tbb::parallel_scan(::tbb::blocked_range(0,n), body); + thrust::advance(result, body.sum); + } + + return result; +} // end copy_if() + +} // end detail +} // end tbb +} // end system +} // end thrust + diff --git a/compat/thrust/system/tbb/detail/count.h b/compat/thrust/system/tbb/detail/count.h new file mode 100644 index 0000000..da31ee8 --- /dev/null +++ b/compat/thrust/system/tbb/detail/count.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the 
License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system inherits count +#include + diff --git a/compat/thrust/system/tbb/detail/equal.h b/compat/thrust/system/tbb/detail/equal.h new file mode 100644 index 0000000..74e5518 --- /dev/null +++ b/compat/thrust/system/tbb/detail/equal.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system inherits equal +#include + diff --git a/compat/thrust/system/tbb/detail/execution_policy.h b/compat/thrust/system/tbb/detail/execution_policy.h new file mode 100644 index 0000000..167d1dc --- /dev/null +++ b/compat/thrust/system/tbb/detail/execution_policy.h @@ -0,0 +1,86 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in ctbbliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +// put the canonical tag in the same ns as the backend's entry points +namespace tbb +{ +namespace detail +{ + +// this awkward sequence of definitions arise +// from the desire both for tag to derive +// from execution_policy and for execution_policy +// to convert to tag (when execution_policy is not +// an ancestor of tag) + +// forward declaration of tag +struct tag; + +// forward declaration of execution_policy +template struct execution_policy; + +// specialize execution_policy for tag +template<> + struct execution_policy + : thrust::system::cpp::detail::execution_policy +{}; + +// tag's definition comes before the +// generic definition of execution_policy +struct tag : execution_policy {}; + +// allow conversion to tag when it is not a successor +template + struct execution_policy + : thrust::system::cpp::detail::execution_policy +{ + // allow conversion to tag + inline operator tag () const + { + return tag(); + } +}; + +} // end detail + +// alias execution_policy and tag here +using thrust::system::tbb::detail::execution_policy; +using thrust::system::tbb::detail::tag; + +} // end tbb +} // end system + +// alias items at top-level +namespace tbb +{ + +using thrust::system::tbb::execution_policy; +using thrust::system::tbb::tag; + +} // end tbb +} // end thrust + diff --git a/compat/thrust/system/tbb/detail/extrema.h b/compat/thrust/system/tbb/detail/extrema.h new file mode 100644 index 0000000..4715a89 --- /dev/null +++ b/compat/thrust/system/tbb/detail/extrema.h @@ -0,0 +1,67 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in ctbbliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
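The comment above explains why tag and execution_policy are defined in this order. A stripped-down, standalone sketch of the same pattern with placeholder names (the real templates additionally derive from the cpp backend's execution_policy so that the tbb system inherits cpp's entry points):

#include <iostream>

template<typename Derived> struct execution_policy;   // forward declaration
struct tag;                                            // forward declaration

// specialize for tag first, so that tag itself can derive from execution_policy<tag>
template<> struct execution_policy<tag> {};

struct tag : execution_policy<tag> {};

// every other execution_policy<Derived> converts to tag, so generic entry
// points taking tag accept any derived policy
template<typename Derived> struct execution_policy
{
  operator tag() const { return tag(); }
};

struct my_policy : execution_policy<my_policy> {};

void dispatch(tag) { std::cout << "dispatched via tag\n"; }

int main()
{
  dispatch(my_policy());  // converts through execution_policy<my_policy>::operator tag()
  return 0;
}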
+ */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace tbb +{ +namespace detail +{ + +template +ForwardIterator max_element(execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + BinaryPredicate comp) +{ + // tbb prefers generic::max_element to cpp::max_element + return thrust::system::detail::generic::max_element(exec, first, last, comp); +} // end max_element() + +template +ForwardIterator min_element(execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + BinaryPredicate comp) +{ + // tbb prefers generic::min_element to cpp::min_element + return thrust::system::detail::generic::min_element(exec, first, last, comp); +} // end min_element() + +template +thrust::pair minmax_element(execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + BinaryPredicate comp) +{ + // tbb prefers generic::minmax_element to cpp::minmax_element + return thrust::system::detail::generic::minmax_element(exec, first, last, comp); +} // end minmax_element() + +} // end detail +} // end tbb +} // end system +} // end thrust + + diff --git a/compat/thrust/system/tbb/detail/fill.h b/compat/thrust/system/tbb/detail/fill.h new file mode 100644 index 0000000..5219e1c --- /dev/null +++ b/compat/thrust/system/tbb/detail/fill.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system inherits fill +#include + diff --git a/compat/thrust/system/tbb/detail/find.h b/compat/thrust/system/tbb/detail/find.h new file mode 100644 index 0000000..d351454 --- /dev/null +++ b/compat/thrust/system/tbb/detail/find.h @@ -0,0 +1,46 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
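A usage sketch for the extrema wrappers above, again assuming the vendored headers and TBB linkage:

#include <thrust/extrema.h>
#include <thrust/pair.h>
#include <thrust/system/tbb/execution_policy.h>
#include <vector>

void extrema_example()
{
  std::vector<int> v(5);
  v[0] = 3; v[1] = 7; v[2] = 1; v[3] = 9; v[4] = 4;

  // forwarded to generic::minmax_element, which can run in parallel under tbb::par
  thrust::pair<std::vector<int>::iterator, std::vector<int>::iterator> mm =
      thrust::minmax_element(thrust::tbb::par, v.begin(), v.end());

  // *mm.first == 1, *mm.second == 9
  (void)mm;
}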
+ */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace tbb +{ +namespace detail +{ + +template +InputIterator find_if(execution_policy &exec, + InputIterator first, + InputIterator last, + Predicate pred) +{ + // tbb prefers generic::find_if to cpp::find_if + return thrust::system::detail::generic::find_if(exec, first, last, pred); +} + +} // end namespace detail +} // end namespace tbb +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/tbb/detail/for_each.h b/compat/thrust/system/tbb/detail/for_each.h new file mode 100644 index 0000000..573bb81 --- /dev/null +++ b/compat/thrust/system/tbb/detail/for_each.h @@ -0,0 +1,54 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in ctbbliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace tbb +{ +namespace detail +{ + +template + RandomAccessIterator for_each(execution_policy &exec, + RandomAccessIterator first, + RandomAccessIterator last, + UnaryFunction f); + +template + RandomAccessIterator for_each_n(execution_policy &exec, + RandomAccessIterator first, + Size n, + UnaryFunction f); + +} // end namespace detail +} // end namespace tbb +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/tbb/detail/for_each.inl b/compat/thrust/system/tbb/detail/for_each.inl new file mode 100644 index 0000000..b09c7be --- /dev/null +++ b/compat/thrust/system/tbb/detail/for_each.inl @@ -0,0 +1,100 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in ctbbliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace tbb +{ +namespace detail +{ +namespace for_each_detail +{ + +template + struct body +{ + RandomAccessIterator m_first; + UnaryFunction m_f; + + body(RandomAccessIterator first, UnaryFunction f) + : m_first(first), m_f(f) + {} + + void operator()(const ::tbb::blocked_range &r) const + { + // we assume that blocked_range specifies a contiguous range of integers + thrust::system::detail::internal::scalar::for_each_n(m_first + r.begin(), r.size(), m_f); + } // end operator()() +}; // end body + + +template + body + make_body(RandomAccessIterator first, UnaryFunction f) +{ + return body(first, f); +} // end make_body() + + +} // end for_each_detail + + +template +RandomAccessIterator for_each_n(execution_policy &, + RandomAccessIterator first, + Size n, + UnaryFunction f) +{ + ::tbb::parallel_for(::tbb::blocked_range(0,n), for_each_detail::make_body(first,f)); + + // return the end of the range + return first + n; +} // end for_each_n + + +template + RandomAccessIterator for_each(execution_policy &s, + RandomAccessIterator first, + RandomAccessIterator last, + UnaryFunction f) +{ + return tbb::detail::for_each_n(s, first, thrust::distance(first,last), f); +} // end for_each() + + +} // end namespace detail +} // end namespace tbb +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/tbb/detail/gather.h b/compat/thrust/system/tbb/detail/gather.h new file mode 100644 index 0000000..dfb7d7f --- /dev/null +++ b/compat/thrust/system/tbb/detail/gather.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system inherits gather +#include + diff --git a/compat/thrust/system/tbb/detail/generate.h b/compat/thrust/system/tbb/detail/generate.h new file mode 100644 index 0000000..0cb33b9 --- /dev/null +++ b/compat/thrust/system/tbb/detail/generate.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
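for_each_n above hands each ::tbb::blocked_range to the sequential scalar::for_each_n, so the public algorithm can be driven as usual. A minimal sketch, with the same assumptions as before:

#include <thrust/for_each.h>
#include <thrust/system/tbb/execution_policy.h>
#include <vector>

struct increment
{
  void operator()(int &x) const { ++x; }
};

void for_each_example()
{
  std::vector<int> v(1 << 20, 0);
  // TBB splits [0, n) into blocked_ranges; each range is processed sequentially
  // by the body functor defined in for_each.inl above
  thrust::for_each(thrust::tbb::par, v.begin(), v.end(), increment());
}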
+ */ + +#pragma once + +#include + +// this system inherits generate +#include + diff --git a/compat/thrust/system/tbb/detail/get_value.h b/compat/thrust/system/tbb/detail/get_value.h new file mode 100644 index 0000000..e376e65 --- /dev/null +++ b/compat/thrust/system/tbb/detail/get_value.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system inherits get_value +#include + diff --git a/compat/thrust/system/tbb/detail/inner_product.h b/compat/thrust/system/tbb/detail/inner_product.h new file mode 100644 index 0000000..351421a --- /dev/null +++ b/compat/thrust/system/tbb/detail/inner_product.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system inherits inner_product +#include + diff --git a/compat/thrust/system/tbb/detail/iter_swap.h b/compat/thrust/system/tbb/detail/iter_swap.h new file mode 100644 index 0000000..16176ec --- /dev/null +++ b/compat/thrust/system/tbb/detail/iter_swap.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system inherits iter_swap +#include + diff --git a/compat/thrust/system/tbb/detail/logical.h b/compat/thrust/system/tbb/detail/logical.h new file mode 100644 index 0000000..b2a80de --- /dev/null +++ b/compat/thrust/system/tbb/detail/logical.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system inherits logical +#include + diff --git a/compat/thrust/system/tbb/detail/malloc_and_free.h b/compat/thrust/system/tbb/detail/malloc_and_free.h new file mode 100644 index 0000000..811a552 --- /dev/null +++ b/compat/thrust/system/tbb/detail/malloc_and_free.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system inherits malloc and free +#include + diff --git a/compat/thrust/system/tbb/detail/memory.inl b/compat/thrust/system/tbb/detail/memory.inl new file mode 100644 index 0000000..420a8a1 --- /dev/null +++ b/compat/thrust/system/tbb/detail/memory.inl @@ -0,0 +1,110 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in ctbbliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace tbb +{ + + +template + template + reference & + reference + ::operator=(const reference &other) +{ + return super_t::operator=(other); +} // end reference::operator=() + +template + reference & + reference + ::operator=(const value_type &x) +{ + return super_t::operator=(x); +} // end reference::operator=() + +template +__host__ __device__ +void swap(reference a, reference b) +{ + a.swap(b); +} // end swap() + +namespace detail +{ + +// XXX circular #inclusion problems cause the compiler to believe that cpp::malloc +// is not defined +// WAR the problem by using adl to call cpp::malloc, which requires it to depend +// on a template parameter +template + pointer malloc_workaround(Tag t, std::size_t n) +{ + return pointer(malloc(t, n)); +} // end malloc_workaround() + +// XXX circular #inclusion problems cause the compiler to believe that cpp::free +// is not defined +// WAR the problem by using adl to call cpp::free, which requires it to depend +// on a template parameter +template + void free_workaround(Tag t, pointer ptr) +{ + free(t, ptr.get()); +} // end free_workaround() + +} // end detail + +inline pointer malloc(std::size_t n) +{ + // XXX this is how we'd like to implement this function, + // if not for circular #inclusion problems: + // + // return pointer(thrust::system::cpp::malloc(n)) + // + return detail::malloc_workaround(cpp::tag(), n); +} // end malloc() + +template +pointer malloc(std::size_t n) +{ + pointer raw_ptr = thrust::system::tbb::malloc(sizeof(T) * n); + return pointer(reinterpret_cast(raw_ptr.get())); +} // end malloc() + +inline void free(pointer ptr) +{ + // XXX this is how we'd like to implement this function, + // if not for circular #inclusion problems: + // + // thrust::system::cpp::free(ptr) + // + detail::free_workaround(cpp::tag(), ptr); +} // end free() + +} // end tbb +} // end system +} // end thrust + diff --git a/compat/thrust/system/tbb/detail/merge.h b/compat/thrust/system/tbb/detail/merge.h new file mode 100644 index 0000000..7b203ec --- /dev/null +++ b/compat/thrust/system/tbb/detail/merge.h @@ -0,0 +1,70 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
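The malloc/free entry points above ultimately come from the cpp backend (via the ADL workaround). A sketch of raw allocation through the tbb system, assuming <thrust/system/tbb/memory.h> is the public header that declares these functions:

#include <thrust/system/tbb/memory.h>
#include <cstddef>

void raw_allocation_example()
{
  const std::size_t N = 100;

  // untyped allocation routed through the workaround to cpp::malloc
  thrust::system::tbb::pointer<void> buf =
      thrust::system::tbb::malloc(N * sizeof(int));

  int *raw = static_cast<int*>(buf.get());
  for(std::size_t i = 0; i < N; ++i)
    raw[i] = 0;

  thrust::system::tbb::free(buf);
}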
+ */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace tbb +{ +namespace detail +{ + +template +OutputIterator merge(execution_policy &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + StrictWeakOrdering comp); + +template +thrust::pair + merge_by_key(execution_policy &exec, + InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 keys_last2, + InputIterator3 values_first3, + InputIterator4 values_first4, + OutputIterator1 keys_result, + OutputIterator2 values_result, + StrictWeakOrdering comp); + +} // end detail +} // end tbb +} // end system +} // end thrust + +#include + diff --git a/compat/thrust/system/tbb/detail/merge.inl b/compat/thrust/system/tbb/detail/merge.inl new file mode 100644 index 0000000..cc902af --- /dev/null +++ b/compat/thrust/system/tbb/detail/merge.inl @@ -0,0 +1,285 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace tbb +{ +namespace detail +{ +namespace merge_detail +{ + +template +struct range +{ + InputIterator1 first1, last1; + InputIterator2 first2, last2; + OutputIterator result; + StrictWeakOrdering comp; + size_t grain_size; + + range(InputIterator1 first1, InputIterator1 last1, + InputIterator2 first2, InputIterator2 last2, + OutputIterator result, + StrictWeakOrdering comp, + size_t grain_size = 1024) + : first1(first1), last1(last1), + first2(first2), last2(last2), + result(result), comp(comp), grain_size(grain_size) + {} + + range(range& r, ::tbb::split) + : first1(r.first1), last1(r.last1), + first2(r.first2), last2(r.last2), + result(r.result), comp(r.comp), grain_size(r.grain_size) + { + // we can assume n1 and n2 are not both 0 + size_t n1 = thrust::distance(first1, last1); + size_t n2 = thrust::distance(first2, last2); + + InputIterator1 mid1 = first1; + InputIterator2 mid2 = first2; + + if (n1 > n2) + { + mid1 += n1 / 2; + mid2 = thrust::system::detail::internal::scalar::lower_bound(first2, last2, raw_reference_cast(*mid1), comp); + } + else + { + mid2 += n2 / 2; + mid1 = thrust::system::detail::internal::scalar::upper_bound(first1, last1, raw_reference_cast(*mid2), comp); + } + + // set first range to [first1, mid1), [first2, mid2), result + r.last1 = mid1; + r.last2 = mid2; + + // set second range to [mid1, last1), [mid2, last2), result + (mid1 - first1) + (mid2 - first2) + first1 = mid1; + first2 = mid2; + result += thrust::distance(r.first1, mid1) + thrust::distance(r.first2, mid2); + } + + bool empty(void) const + { + return (first1 == last1) && (first2 == last2); + } + + bool is_divisible(void) const + { + return static_cast(thrust::distance(first1, last1) + thrust::distance(first2, last2)) > grain_size; + } +}; + +struct body +{ + template + void operator()(Range& r) const + { + 
thrust::system::detail::internal::scalar::merge + (r.first1, r.last1, + r.first2, r.last2, + r.result, + r.comp); + } +}; + +} // end namespace merge_detail + +namespace merge_by_key_detail +{ + +template +struct range +{ + InputIterator1 keys_first1, keys_last1; + InputIterator2 keys_first2, keys_last2; + InputIterator3 values_first1; + InputIterator4 values_first2; + OutputIterator1 keys_result; + OutputIterator2 values_result; + StrictWeakOrdering comp; + size_t grain_size; + + range(InputIterator1 keys_first1, InputIterator1 keys_last1, + InputIterator2 keys_first2, InputIterator2 keys_last2, + InputIterator3 values_first1, + InputIterator4 values_first2, + OutputIterator1 keys_result, + OutputIterator2 values_result, + StrictWeakOrdering comp, + size_t grain_size = 1024) + : keys_first1(keys_first1), keys_last1(keys_last1), + keys_first2(keys_first2), keys_last2(keys_last2), + values_first1(values_first1), + values_first2(values_first2), + keys_result(keys_result), values_result(values_result), + comp(comp), grain_size(grain_size) + {} + + range(range& r, ::tbb::split) + : keys_first1(r.keys_first1), keys_last1(r.keys_last1), + keys_first2(r.keys_first2), keys_last2(r.keys_last2), + values_first1(r.values_first1), + values_first2(r.values_first2), + keys_result(r.keys_result), values_result(r.values_result), + comp(r.comp), grain_size(r.grain_size) + { + // we can assume n1 and n2 are not both 0 + size_t n1 = thrust::distance(keys_first1, keys_last1); + size_t n2 = thrust::distance(keys_first2, keys_last2); + + InputIterator1 mid1 = keys_first1; + InputIterator2 mid2 = keys_first2; + + if (n1 > n2) + { + mid1 += n1 / 2; + mid2 = thrust::system::detail::internal::scalar::lower_bound(keys_first2, keys_last2, raw_reference_cast(*mid1), comp); + } + else + { + mid2 += n2 / 2; + mid1 = thrust::system::detail::internal::scalar::upper_bound(keys_first1, keys_last1, raw_reference_cast(*mid2), comp); + } + + // set first range to [keys_first1, mid1), [keys_first2, mid2), keys_result, values_result + r.keys_last1 = mid1; + r.keys_last2 = mid2; + + // set second range to [mid1, keys_last1), [mid2, keys_last2), keys_result + (mid1 - keys_first1) + (mid2 - keys_first2), values_result + (mid1 - keys_first1) + (mid2 - keys_first2) + keys_first1 = mid1; + keys_first2 = mid2; + values_first1 += thrust::distance(r.keys_first1, mid1); + values_first2 += thrust::distance(r.keys_first2, mid2); + keys_result += thrust::distance(r.keys_first1, mid1) + thrust::distance(r.keys_first2, mid2); + values_result += thrust::distance(r.keys_first1, mid1) + thrust::distance(r.keys_first2, mid2); + } + + bool empty(void) const + { + return (keys_first1 == keys_last1) && (keys_first2 == keys_last2); + } + + bool is_divisible(void) const + { + return static_cast(thrust::distance(keys_first1, keys_last1) + thrust::distance(keys_first2, keys_last2)) > grain_size; + } +}; + +struct body +{ + template + void operator()(Range& r) const + { + thrust::system::detail::internal::scalar::merge_by_key + (r.keys_first1, r.keys_last1, + r.keys_first2, r.keys_last2, + r.values_first1, + r.values_first2, + r.keys_result, + r.values_result, + r.comp); + } +}; + +} // end namespace merge_by_key_detail + + +template +OutputIterator merge(execution_policy &exec, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + StrictWeakOrdering comp) +{ + typedef typename merge_detail::range Range; + typedef merge_detail::body Body; + Range range(first1, last1, first2, last2, 
result, comp); + Body body; + + ::tbb::parallel_for(range, body); + + thrust::advance(result, thrust::distance(first1, last1) + thrust::distance(first2, last2)); + + return result; +} // end merge() + +template +thrust::pair + merge_by_key(execution_policy &exec, + InputIterator1 keys_first1, + InputIterator1 keys_last1, + InputIterator2 keys_first2, + InputIterator2 keys_last2, + InputIterator3 values_first3, + InputIterator4 values_first4, + OutputIterator1 keys_result, + OutputIterator2 values_result, + StrictWeakOrdering comp) +{ + typedef typename merge_by_key_detail::range Range; + typedef merge_by_key_detail::body Body; + + Range range(keys_first1, keys_last1, keys_first2, keys_last2, values_first3, values_first4, keys_result, values_result, comp); + Body body; + + ::tbb::parallel_for(range, body); + + thrust::advance(keys_result, thrust::distance(keys_first1, keys_last1) + thrust::distance(keys_first2, keys_last2)); + thrust::advance(values_result, thrust::distance(keys_first1, keys_last1) + thrust::distance(keys_first2, keys_last2)); + + return thrust::make_pair(keys_result,values_result); +} + +} // end namespace detail +} // end namespace tbb +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/tbb/detail/mismatch.h b/compat/thrust/system/tbb/detail/mismatch.h new file mode 100644 index 0000000..03980cf --- /dev/null +++ b/compat/thrust/system/tbb/detail/mismatch.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system inherits mismatch +#include + diff --git a/compat/thrust/system/tbb/detail/par.h b/compat/thrust/system/tbb/detail/par.h new file mode 100644 index 0000000..74801ab --- /dev/null +++ b/compat/thrust/system/tbb/detail/par.h @@ -0,0 +1,66 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
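merge_detail::range above splits work by halving the larger input and binary-searching the split key in the other input, so each half can be merged independently and written at a known offset. A usage sketch of the public algorithm under the same assumptions:

#include <thrust/merge.h>
#include <thrust/system/tbb/execution_policy.h>
#include <vector>

void merge_example()
{
  int a_init[] = {1, 3, 5, 7};
  int b_init[] = {2, 4, 6, 8};
  std::vector<int> a(a_init, a_init + 4);
  std::vector<int> b(b_init, b_init + 4);
  std::vector<int> out(a.size() + b.size());

  // ::tbb::parallel_for recursively splits the pair of sorted inputs as in
  // merge_detail::range, then merges each piece sequentially
  thrust::merge(thrust::tbb::par, a.begin(), a.end(), b.begin(), b.end(), out.begin());

  // out == {1, 2, 3, 4, 5, 6, 7, 8}
}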
+ */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace tbb +{ +namespace detail +{ + + +struct par_t : thrust::system::tbb::detail::execution_policy +{ + par_t() : thrust::system::tbb::detail::execution_policy() {} + + template + thrust::detail::execute_with_allocator + operator()(Allocator &alloc) const + { + return thrust::detail::execute_with_allocator(alloc); + } +}; + + +} // end detail + + +static const detail::par_t par; + + +} // end tbb +} // end system + + +// alias par here +namespace tbb +{ + + +using thrust::system::tbb::par; + + +} // end tbb +} // end thrust + diff --git a/compat/thrust/system/tbb/detail/partition.h b/compat/thrust/system/tbb/detail/partition.h new file mode 100644 index 0000000..af37121 --- /dev/null +++ b/compat/thrust/system/tbb/detail/partition.h @@ -0,0 +1,87 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace tbb +{ +namespace detail +{ + + +template + ForwardIterator stable_partition(execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + Predicate pred); + +template + ForwardIterator stable_partition(execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + InputIterator stencil, + Predicate pred); + +template + thrust::pair + stable_partition_copy(execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator1 out_true, + OutputIterator2 out_false, + Predicate pred); + + +template + thrust::pair + stable_partition_copy(execution_policy &exec, + InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator1 out_true, + OutputIterator2 out_false, + Predicate pred); + + +} // end namespace detail +} // end namespace tbb +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/tbb/detail/partition.inl b/compat/thrust/system/tbb/detail/partition.inl new file mode 100644 index 0000000..1e421e1 --- /dev/null +++ b/compat/thrust/system/tbb/detail/partition.inl @@ -0,0 +1,102 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
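par above is both an execution policy object and, through operator(), a factory for a policy that draws temporary storage from a user-supplied allocator. A sketch of both uses (my_allocator is hypothetical and shown only in a comment):

#include <thrust/sort.h>
#include <thrust/system/tbb/execution_policy.h>
#include <cstddef>
#include <vector>

void par_example()
{
  std::vector<int> v(1 << 16);
  for(std::size_t i = 0; i < v.size(); ++i)
    v[i] = static_cast<int>(v.size() - i);

  // plain policy object: run the algorithm on the tbb backend
  thrust::sort(thrust::tbb::par, v.begin(), v.end());

  // par can also be called with an allocator; temporary memory needed by the
  // algorithm is then obtained from it (my_allocator is a placeholder):
  //   my_allocator alloc;
  //   thrust::sort(thrust::tbb::par(alloc), v.begin(), v.end());
}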
+ */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace tbb +{ +namespace detail +{ + + +template + ForwardIterator stable_partition(execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + Predicate pred) +{ + // tbb prefers generic::stable_partition to cpp::stable_partition + return thrust::system::detail::generic::stable_partition(exec, first, last, pred); +} // end stable_partition() + + +template + ForwardIterator stable_partition(execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + InputIterator stencil, + Predicate pred) +{ + // tbb prefers generic::stable_partition to cpp::stable_partition + return thrust::system::detail::generic::stable_partition(exec, first, last, stencil, pred); +} // end stable_partition() + +template + thrust::pair + stable_partition_copy(execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator1 out_true, + OutputIterator2 out_false, + Predicate pred) +{ + // tbb prefers generic::stable_partition_copy to cpp::stable_partition_copy + return thrust::system::detail::generic::stable_partition_copy(exec, first, last, out_true, out_false, pred); +} // end stable_partition_copy() + + +template + thrust::pair + stable_partition_copy(execution_policy &exec, + InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator1 out_true, + OutputIterator2 out_false, + Predicate pred) +{ + // tbb prefers generic::stable_partition_copy to cpp::stable_partition_copy + return thrust::system::detail::generic::stable_partition_copy(exec, first, last, stencil, out_true, out_false, pred); +} // end stable_partition_copy() + + +} // end namespace detail +} // end namespace tbb +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/tbb/detail/reduce.h b/compat/thrust/system/tbb/detail/reduce.h new file mode 100644 index 0000000..83a7cc3 --- /dev/null +++ b/compat/thrust/system/tbb/detail/reduce.h @@ -0,0 +1,54 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file reduce.h + * \brief TBB implementation of reduce. + */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace tbb +{ +namespace detail +{ + + +template + OutputType reduce(execution_policy &exec, + InputIterator begin, + InputIterator end, + OutputType init, + BinaryFunction binary_op); + + +} // end namespace detail +} // end namespace tbb +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/tbb/detail/reduce.inl b/compat/thrust/system/tbb/detail/reduce.inl new file mode 100644 index 0000000..c249852 --- /dev/null +++ b/compat/thrust/system/tbb/detail/reduce.inl @@ -0,0 +1,131 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
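A usage sketch for the partition wrappers above, which simply forward to the generic implementations:

#include <thrust/partition.h>
#include <thrust/system/tbb/execution_policy.h>
#include <vector>

struct is_even
{
  bool operator()(int x) const { return (x % 2) == 0; }
};

void partition_example()
{
  int init[] = {1, 2, 3, 4, 5, 6};
  std::vector<int> v(init, init + 6);

  // forwarded to generic::stable_partition, which tbb prefers over the
  // sequential cpp implementation
  std::vector<int>::iterator middle =
      thrust::stable_partition(thrust::tbb::par, v.begin(), v.end(), is_even());

  // v == {2, 4, 6, 1, 3, 5}; middle points at the first odd element
  (void)middle;
}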
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace tbb +{ +namespace detail +{ +namespace reduce_detail +{ + +template +struct body +{ + RandomAccessIterator first; + OutputType sum; + bool first_call; // TBB can invoke operator() multiple times on the same body + thrust::detail::host_function binary_op; + + // note: we only initalize sum with init to avoid calling OutputType's default constructor + body(RandomAccessIterator first, OutputType init, BinaryFunction binary_op) + : first(first), sum(init), first_call(true), binary_op(binary_op) + {} + + // note: we only initalize sum with b.sum to avoid calling OutputType's default constructor + body(body& b, ::tbb::split) + : first(b.first), sum(b.sum), first_call(true), binary_op(b.binary_op) + {} + + template + void operator()(const ::tbb::blocked_range &r) + { + // we assume that blocked_range specifies a contiguous range of integers + + if (r.empty()) return; // nothing to do + + RandomAccessIterator iter = first + r.begin(); + + OutputType temp = thrust::raw_reference_cast(*iter); + + ++iter; + + for (Size i = r.begin() + 1; i != r.end(); ++i, ++iter) + temp = binary_op(temp, *iter); + + + if (first_call) + { + // first time body has been invoked + first_call = false; + sum = temp; + } + else + { + // body has been previously invoked, accumulate temp into sum + sum = binary_op(sum, temp); + } + } // end operator()() + + void join(body& b) + { + sum = binary_op(sum, b.sum); + } +}; // end body + +} // end reduce_detail + + +template + OutputType reduce(execution_policy &exec, + InputIterator begin, + InputIterator end, + OutputType init, + BinaryFunction binary_op) +{ + typedef typename thrust::iterator_difference::type Size; + + Size n = thrust::distance(begin, end); + + if (n == 0) + { + return init; + } + else + { + typedef typename reduce_detail::body Body; + Body reduce_body(begin, init, binary_op); + ::tbb::parallel_reduce(::tbb::blocked_range(0,n), reduce_body); + return binary_op(init, reduce_body.sum); + } +} + + +} // end namespace detail +} // end namespace tbb +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/tbb/detail/reduce_by_key.h b/compat/thrust/system/tbb/detail/reduce_by_key.h new file mode 100644 index 0000000..0149a76 --- /dev/null +++ b/compat/thrust/system/tbb/detail/reduce_by_key.h @@ -0,0 +1,57 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
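reduce_detail::body above reduces each sub-range sequentially, combines partial sums in join(), and folds init in exactly once at the end, which is why OutputType never needs a default constructor. A usage sketch:

#include <thrust/reduce.h>
#include <thrust/functional.h>
#include <thrust/system/tbb/execution_policy.h>
#include <vector>

void reduce_example()
{
  std::vector<int> v(1000, 1);

  // each ::tbb::blocked_range is reduced by reduce_detail::body; the partial
  // sums are joined and init (0 here) is applied once at the end
  int sum = thrust::reduce(thrust::tbb::par, v.begin(), v.end(), 0, thrust::plus<int>());

  // sum == 1000
  (void)sum;
}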
+ */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace tbb +{ +namespace detail +{ + + +template + thrust::pair + reduce_by_key(execution_policy &exec, + InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + OutputIterator1 keys_output, + OutputIterator2 values_output, + BinaryPredicate binary_pred, + BinaryFunction binary_op); + + +} // end namespace detail +} // end namespace tbb +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/tbb/detail/reduce_by_key.inl b/compat/thrust/system/tbb/detail/reduce_by_key.inl new file mode 100644 index 0000000..10d2d8b --- /dev/null +++ b/compat/thrust/system/tbb/detail/reduce_by_key.inl @@ -0,0 +1,344 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace thrust +{ +namespace system +{ +namespace tbb +{ +namespace detail +{ +namespace reduce_by_key_detail +{ + + +template + inline L divide_ri(const L x, const R y) +{ + return (x + (y - 1)) / y; +} + + +template + struct partial_sum_type + : thrust::detail::eval_if< + thrust::detail::has_result_type::value, + thrust::detail::result_type, + thrust::detail::eval_if< + thrust::detail::is_output_iterator::value, + thrust::iterator_value, + thrust::iterator_value + > + > +{}; + + +template + struct partial_sum_type + : thrust::detail::eval_if< + thrust::detail::has_result_type::value, + thrust::detail::result_type, + thrust::iterator_value + > +{}; + + +template + thrust::pair< + InputIterator1, + thrust::pair< + typename InputIterator1::value_type, + typename partial_sum_type::type + > + > + reduce_last_segment_backward(InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + BinaryPredicate binary_pred, + BinaryFunction binary_op) +{ + typename thrust::iterator_difference::type n = keys_last - keys_first; + + // reverse the ranges and consume from the end + thrust::reverse_iterator keys_first_r(keys_last); + thrust::reverse_iterator keys_last_r(keys_first); + thrust::reverse_iterator values_first_r(values_first + n); + + typename InputIterator1::value_type result_key = *keys_first_r; + typename partial_sum_type::type result_value = *values_first_r; + + // consume the entirety of the first key's sequence + for(++keys_first_r, ++values_first_r; + (keys_first_r != keys_last_r) && binary_pred(*keys_first_r, result_key); + ++keys_first_r, ++values_first_r) + { + result_value = binary_op(result_value, *values_first_r); + } + + return thrust::make_pair(keys_first_r.base(), thrust::make_pair(result_key, result_value)); +} + + +template + thrust::tuple< + OutputIterator1, + OutputIterator2, + typename InputIterator1::value_type, + typename partial_sum_type::type + > + reduce_by_key_with_carry(InputIterator1 keys_first, + InputIterator1 
keys_last, + InputIterator2 values_first, + OutputIterator1 keys_output, + OutputIterator2 values_output, + BinaryPredicate binary_pred, + BinaryFunction binary_op) +{ + // first, consume the last sequence to produce the carry + // XXX is there an elegant way to pose this such that we don't need to default construct carry? + thrust::pair< + typename InputIterator1::value_type, + typename partial_sum_type::type + > carry; + + thrust::tie(keys_last, carry) = reduce_last_segment_backward(keys_first, keys_last, values_first, binary_pred, binary_op); + + // finish with sequential reduce_by_key + thrust::cpp::tag seq; + thrust::tie(keys_output, values_output) = + thrust::reduce_by_key(seq, keys_first, keys_last, values_first, keys_output, values_output, binary_pred, binary_op); + + return thrust::make_tuple(keys_output, values_output, carry.first, carry.second); +} + + +template + bool interval_has_carry(size_t interval_idx, size_t interval_size, size_t num_intervals, Iterator tail_flags) +{ + // to discover whether the interval has a carry, look at the tail_flag corresponding to its last element + // the final interval never has a carry by definition + return (interval_idx + 1 < num_intervals) ? !tail_flags[(interval_idx + 1) * interval_size - 1] : false; +} + + +template + struct serial_reduce_by_key_body +{ + typedef typename thrust::iterator_difference::type size_type; + + Iterator1 keys_first; + Iterator2 values_first; + Iterator3 result_offset; + Iterator4 keys_result; + Iterator5 values_result; + Iterator6 carry_result; + + size_type n; + size_type interval_size; + size_type num_intervals; + + BinaryPredicate binary_pred; + BinaryFunction binary_op; + + serial_reduce_by_key_body(Iterator1 keys_first, Iterator2 values_first, Iterator3 result_offset, Iterator4 keys_result, Iterator5 values_result, Iterator6 carry_result, size_type n, size_type interval_size, size_type num_intervals, BinaryPredicate binary_pred, BinaryFunction binary_op) + : keys_first(keys_first), values_first(values_first), + result_offset(result_offset), + keys_result(keys_result), + values_result(values_result), + carry_result(carry_result), + n(n), + interval_size(interval_size), + num_intervals(num_intervals), + binary_pred(binary_pred), + binary_op(binary_op) + {} + + void operator()(const ::tbb::blocked_range &r) const + { + assert(r.size() == 1); + + const size_type interval_idx = r.begin(); + + const size_type offset_to_first = interval_size * interval_idx; + const size_type offset_to_last = thrust::min(n, offset_to_first + interval_size); + + Iterator1 my_keys_first = keys_first + offset_to_first; + Iterator1 my_keys_last = keys_first + offset_to_last; + Iterator2 my_values_first = values_first + offset_to_first; + Iterator3 my_result_offset = result_offset + interval_idx; + Iterator4 my_keys_result = keys_result + *my_result_offset; + Iterator5 my_values_result = values_result + *my_result_offset; + Iterator6 my_carry_result = carry_result + interval_idx; + + // consume the rest of the interval with reduce_by_key + typedef typename thrust::iterator_value::type key_type; + typedef typename partial_sum_type::type value_type; + + // XXX is there a way to pose this so that we don't require default construction of carry? 
+ thrust::pair carry; + + thrust::tie(my_keys_result, my_values_result, carry.first, carry.second) = + reduce_by_key_with_carry(my_keys_first, + my_keys_last, + my_values_first, + my_keys_result, + my_values_result, + binary_pred, + binary_op); + + // store to carry only when we actually have a carry + // store to my_keys_result & my_values_result otherwise + + // create tail_flags so we can check for a carry + thrust::detail::tail_flags flags = thrust::detail::make_tail_flags(keys_first, keys_first + n, binary_pred); + + if(interval_has_carry(interval_idx, interval_size, num_intervals, flags.begin())) + { + // we can ignore the carry's key + // XXX because the carry result is uninitialized, we should copy construct + *my_carry_result = carry.second; + } + else + { + *my_keys_result = carry.first; + *my_values_result = carry.second; + } + } +}; + + +template + serial_reduce_by_key_body + make_serial_reduce_by_key_body(Iterator1 keys_first, Iterator2 values_first, Iterator3 result_offset, Iterator4 keys_result, Iterator5 values_result, Iterator6 carry_result, typename thrust::iterator_difference::type n, size_t interval_size, size_t num_intervals, BinaryPredicate binary_pred, BinaryFunction binary_op) +{ + return serial_reduce_by_key_body(keys_first, values_first, result_offset, keys_result, values_result, carry_result, n, interval_size, num_intervals, binary_pred, binary_op); +} + + +} // end reduce_by_key_detail + + +template + thrust::pair + reduce_by_key(thrust::tbb::execution_policy &exec, + Iterator1 keys_first, Iterator1 keys_last, + Iterator2 values_first, + Iterator3 keys_result, + Iterator4 values_result, + BinaryPredicate binary_pred, + BinaryFunction binary_op) +{ + + typedef typename thrust::iterator_difference::type difference_type; + difference_type n = keys_last - keys_first; + if(n == 0) return thrust::make_pair(keys_result, values_result); + + // XXX this value is a tuning opportunity + const difference_type parallelism_threshold = 10000; + + if(n < parallelism_threshold) + { + // don't bother parallelizing for small n + thrust::cpp::tag seq; + return thrust::reduce_by_key(seq, keys_first, keys_last, values_first, keys_result, values_result, binary_pred, binary_op); + } + + // count the number of processors + const unsigned int p = thrust::max(1u, ::tbb::tbb_thread::hardware_concurrency()); + + // generate O(P) intervals of sequential work + // XXX oversubscribing is a tuning opportunity + const unsigned int subscription_rate = 1; + difference_type interval_size = thrust::min(parallelism_threshold, thrust::max(n, n / (subscription_rate * p))); + difference_type num_intervals = reduce_by_key_detail::divide_ri(n, interval_size); + + // decompose the input into intervals of size N / num_intervals + // add one extra element to this vector to store the size of the entire result + thrust::detail::temporary_array interval_output_offsets(0, exec, num_intervals + 1); + + // first count the number of tail flags in each interval + thrust::detail::tail_flags tail_flags = thrust::detail::make_tail_flags(keys_first, keys_last, binary_pred); + thrust::system::tbb::detail::reduce_intervals(exec, tail_flags.begin(), tail_flags.end(), interval_size, interval_output_offsets.begin() + 1, thrust::plus()); + interval_output_offsets[0] = 0; + + // scan the counts to get each body's output offset + thrust::cpp::tag seq; + thrust::inclusive_scan(seq, + interval_output_offsets.begin() + 1, interval_output_offsets.end(), + interval_output_offsets.begin() + 1); + + // do a reduce_by_key serially in 
each thread + // the final interval never has a carry by definition, so don't reserve space for it + typedef typename reduce_by_key_detail::partial_sum_type::type carry_type; + thrust::detail::temporary_array carries(0, exec, num_intervals - 1); + + // force grainsize == 1 with simple_partioner() + ::tbb::parallel_for(::tbb::blocked_range(0, num_intervals, 1), + reduce_by_key_detail::make_serial_reduce_by_key_body(keys_first, values_first, interval_output_offsets.begin(), keys_result, values_result, carries.begin(), n, interval_size, num_intervals, binary_pred, binary_op), + ::tbb::simple_partitioner()); + + difference_type size_of_result = interval_output_offsets[num_intervals]; + + // sequentially accumulate the carries + // note that the last interval does not have a carry + // XXX find a way to express this loop via a sequential algorithm, perhaps reduce_by_key + for(typename thrust::detail::temporary_array::size_type i = 0; i < carries.size(); ++i) + { + // if our interval has a carry, then we need to sum the carry to the next interval's output offset + // if it does not have a carry, then we need to ignore carry_value[i] + if(reduce_by_key_detail::interval_has_carry(i, interval_size, num_intervals, tail_flags.begin())) + { + difference_type output_idx = interval_output_offsets[i+1]; + + values_result[output_idx] = binary_op(values_result[output_idx], carries[i]); + } + } + + return thrust::make_pair(keys_result + size_of_result, values_result + size_of_result); +} + + +} // end detail +} // end tbb +} // end system +} // end thrust + diff --git a/compat/thrust/system/tbb/detail/reduce_intervals.h b/compat/thrust/system/tbb/detail/reduce_intervals.h new file mode 100644 index 0000000..0647ffd --- /dev/null +++ b/compat/thrust/system/tbb/detail/reduce_intervals.h @@ -0,0 +1,126 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
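For orientation, a minimal serial sketch (standard C++ only; operator== and + stand in for binary_pred and binary_op) of the reduce_by_key semantics that the interval-and-carry machinery above parallelizes: consecutive equal keys collapse into one output key whose value is the reduction of the corresponding input values.

    #include <cstdio>
    #include <utility>
    #include <vector>

    static std::pair<std::vector<int>, std::vector<int> >
    serial_reduce_by_key(const std::vector<int>& keys, const std::vector<int>& vals)
    {
        std::vector<int> out_keys, out_vals;
        for (std::size_t i = 0; i < keys.size(); ++i)
        {
            if (!out_keys.empty() && out_keys.back() == keys[i])
                out_vals.back() += vals[i];          // same segment: accumulate
            else
            {
                out_keys.push_back(keys[i]);         // new segment starts here
                out_vals.push_back(vals[i]);
            }
        }
        return std::make_pair(out_keys, out_vals);
    }

    int main()
    {
        std::vector<int> k(6, 1);  k[2] = k[3] = k[4] = 2;  k[5] = 3;   // 1 1 2 2 2 3
        std::vector<int> v(6, 1);
        std::pair<std::vector<int>, std::vector<int> > r = serial_reduce_by_key(k, v);
        for (std::size_t i = 0; i < r.first.size(); ++i)
            std::printf("%d -> %d\n", r.first[i], r.second[i]);          // 1->2, 2->3, 3->1
        return 0;
    }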
+ */ + +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace tbb +{ +namespace detail +{ +namespace reduce_intervals_detail +{ + + +template + inline L divide_ri(const L x, const R y) +{ + return (x + (y - 1)) / y; +} + + +template + struct body +{ + RandomAccessIterator1 first; + RandomAccessIterator2 result; + Size n, interval_size; + BinaryFunction binary_op; + + body(RandomAccessIterator1 first, RandomAccessIterator2 result, Size n, Size interval_size, BinaryFunction binary_op) + : first(first), result(result), n(n), interval_size(interval_size), binary_op(binary_op) + {} + + void operator()(const ::tbb::blocked_range &r) const + { + assert(r.size() == 1); + + Size interval_idx = r.begin(); + + Size offset_to_first = interval_size * interval_idx; + Size offset_to_last = thrust::min(n, offset_to_first + interval_size); + + RandomAccessIterator1 my_first = first + offset_to_first; + RandomAccessIterator1 my_last = first + offset_to_last; + + thrust::cpp::tag seq; + + // carefully pass the init value for the interval with raw_reference_cast + typedef typename BinaryFunction::result_type sum_type; + result[interval_idx] = + thrust::reduce(seq, my_first + 1, my_last, sum_type(thrust::raw_reference_cast(*my_first)), binary_op); + } +}; + + +template + body + make_body(RandomAccessIterator1 first, RandomAccessIterator2 result, Size n, Size interval_size, BinaryFunction binary_op) +{ + return body(first, result, n, interval_size, binary_op); +} + + +} // end reduce_intervals_detail + + +template + void reduce_intervals(thrust::tbb::execution_policy &, + RandomAccessIterator1 first, + RandomAccessIterator1 last, + Size interval_size, + RandomAccessIterator2 result, + BinaryFunction binary_op) +{ + typename thrust::iterator_difference::type n = last - first; + + Size num_intervals = reduce_intervals_detail::divide_ri(n, interval_size); + + ::tbb::parallel_for(::tbb::blocked_range(0, num_intervals, 1), reduce_intervals_detail::make_body(first, result, Size(n), interval_size, binary_op), ::tbb::simple_partitioner()); +} + + +template + void reduce_intervals(thrust::tbb::execution_policy &exec, + RandomAccessIterator1 first, + RandomAccessIterator1 last, + Size interval_size, + RandomAccessIterator2 result) +{ + typedef typename thrust::iterator_value::type value_type; + + return thrust::system::tbb::detail::reduce_intervals(exec, first, last, interval_size, result, thrust::plus()); +} + + +} // end detail +} // end tbb +} // end system +} // end thrust + diff --git a/compat/thrust/system/tbb/detail/remove.h b/compat/thrust/system/tbb/detail/remove.h new file mode 100644 index 0000000..48cbb5c --- /dev/null +++ b/compat/thrust/system/tbb/detail/remove.h @@ -0,0 +1,81 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
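A rough standard-C++ illustration of the interval decomposition performed by reduce_intervals above: the input is cut into ceil(n / interval_size) chunks via round-up division, and each chunk is reduced independently, seeded with its first element. All names here are illustrative; the real code runs the outer loop with tbb::parallel_for.

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    static std::size_t divide_ri(std::size_t x, std::size_t y)   // round-up division
    {
        return (x + (y - 1)) / y;
    }

    static std::vector<int> reduce_intervals(const std::vector<int>& in, std::size_t interval_size)
    {
        std::size_t n = in.size();
        std::size_t num_intervals = divide_ri(n, interval_size);
        std::vector<int> result(num_intervals, 0);
        for (std::size_t i = 0; i < num_intervals; ++i)          // parallel_for in the real code
        {
            std::size_t first = i * interval_size;
            std::size_t last  = std::min(n, first + interval_size);
            int sum = in[first];                                  // seed with the first element
            for (std::size_t j = first + 1; j < last; ++j)
                sum += in[j];
            result[i] = sum;
        }
        return result;
    }

    int main()
    {
        std::vector<int> data(10, 1);
        std::vector<int> sums = reduce_intervals(data, 4);        // {4, 4, 2}
        for (std::size_t i = 0; i < sums.size(); ++i)
            std::printf("%d\n", sums[i]);
        return 0;
    }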
+ */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace omp +{ +namespace detail +{ + +template + ForwardIterator remove_if(execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + Predicate pred); + + +template + ForwardIterator remove_if(execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + InputIterator stencil, + Predicate pred); + + +template + OutputIterator remove_copy_if(execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + Predicate pred); + + +template + OutputIterator remove_copy_if(execution_policy &exec, + InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator result, + Predicate pred); + + +} // end namespace detail +} // end namespace omp +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/tbb/detail/remove.inl b/compat/thrust/system/tbb/detail/remove.inl new file mode 100644 index 0000000..01916c5 --- /dev/null +++ b/compat/thrust/system/tbb/detail/remove.inl @@ -0,0 +1,94 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace tbb +{ +namespace detail +{ + +template + ForwardIterator remove_if(execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + Predicate pred) +{ + // tbb prefers generic::remove_if to cpp::remove_if + return thrust::system::detail::generic::remove_if(exec, first, last, pred); +} + + +template + ForwardIterator remove_if(execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + InputIterator stencil, + Predicate pred) +{ + // tbb prefers generic::remove_if to cpp::remove_if + return thrust::system::detail::generic::remove_if(exec, first, last, stencil, pred); +} + + +template + OutputIterator remove_copy_if(execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + Predicate pred) +{ + // tbb prefers generic::remove_copy_if to cpp::remove_copy_if + return thrust::system::detail::generic::remove_copy_if(exec, first, last, result, pred); +} + +template + OutputIterator remove_copy_if(execution_policy &exec, + InputIterator1 first, + InputIterator1 last, + InputIterator2 stencil, + OutputIterator result, + Predicate pred) +{ + // tbb prefers generic::remove_copy_if to cpp::remove_copy_if + return thrust::system::detail::generic::remove_copy_if(exec, first, last, stencil, result, pred); +} + +} // end namespace detail +} // end namespace tbb +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/tbb/detail/replace.h b/compat/thrust/system/tbb/detail/replace.h new file mode 100644 index 0000000..c48555d --- /dev/null +++ b/compat/thrust/system/tbb/detail/replace.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the 
"License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system inherits this algorithm +#include + diff --git a/compat/thrust/system/tbb/detail/reverse.h b/compat/thrust/system/tbb/detail/reverse.h new file mode 100644 index 0000000..04923d1 --- /dev/null +++ b/compat/thrust/system/tbb/detail/reverse.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system inherits reverse +#include + diff --git a/compat/thrust/system/tbb/detail/scan.h b/compat/thrust/system/tbb/detail/scan.h new file mode 100644 index 0000000..ed5cacd --- /dev/null +++ b/compat/thrust/system/tbb/detail/scan.h @@ -0,0 +1,64 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file scan.h + * \brief TBB implementations of scan functions. + */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace tbb +{ +namespace detail +{ + +template + OutputIterator inclusive_scan(tag, + InputIterator first, + InputIterator last, + OutputIterator result, + BinaryFunction binary_op); + + +template + OutputIterator exclusive_scan(tag, + InputIterator first, + InputIterator last, + OutputIterator result, + T init, + BinaryFunction binary_op); + + +} // end namespace detail +} // end namespace tbb +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/tbb/detail/scan.inl b/compat/thrust/system/tbb/detail/scan.inl new file mode 100644 index 0000000..4887824 --- /dev/null +++ b/compat/thrust/system/tbb/detail/scan.inl @@ -0,0 +1,293 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace tbb +{ +namespace detail +{ +namespace scan_detail +{ + +template +struct inclusive_body +{ + InputIterator input; + OutputIterator output; + thrust::detail::host_function binary_op; + ValueType sum; + bool first_call; + + inclusive_body(InputIterator input, OutputIterator output, BinaryFunction binary_op, ValueType dummy) + : input(input), output(output), binary_op(binary_op), sum(dummy), first_call(true) + {} + + inclusive_body(inclusive_body& b, ::tbb::split) + : input(b.input), output(b.output), binary_op(b.binary_op), sum(b.sum), first_call(true) + {} + + template + void operator()(const ::tbb::blocked_range& r, ::tbb::pre_scan_tag) + { + InputIterator iter = input + r.begin(); + + ValueType temp = *iter; + + ++iter; + + for (Size i = r.begin() + 1; i != r.end(); ++i, ++iter) + temp = binary_op(temp, *iter); + + if (first_call) + sum = temp; + else + sum = binary_op(sum, temp); + + first_call = false; + } + + template + void operator()(const ::tbb::blocked_range& r, ::tbb::final_scan_tag) + { + InputIterator iter1 = input + r.begin(); + OutputIterator iter2 = output + r.begin(); + + if (first_call) + { + *iter2 = sum = *iter1; + ++iter1; + ++iter2; + for (Size i = r.begin() + 1; i != r.end(); ++i, ++iter1, ++iter2) + *iter2 = sum = binary_op(sum, *iter1); + } + else + { + for (Size i = r.begin(); i != r.end(); ++i, ++iter1, ++iter2) + *iter2 = sum = binary_op(sum, *iter1); + } + + first_call = false; + } + + void reverse_join(inclusive_body& b) + { + sum = binary_op(b.sum, sum); + } + + void assign(inclusive_body& b) + { + sum = b.sum; + } +}; + + +template +struct exclusive_body +{ + InputIterator input; + OutputIterator output; + thrust::detail::host_function binary_op; + ValueType sum; + bool first_call; + + exclusive_body(InputIterator input, OutputIterator output, BinaryFunction binary_op, ValueType init) + : input(input), output(output), binary_op(binary_op), sum(init), first_call(true) + {} + + exclusive_body(exclusive_body& b, ::tbb::split) + : input(b.input), output(b.output), binary_op(b.binary_op), sum(b.sum), first_call(true) + {} + + template + void operator()(const ::tbb::blocked_range& r, ::tbb::pre_scan_tag) + { + InputIterator iter = input + r.begin(); + + ValueType temp = *iter; + + ++iter; + + for (Size i = r.begin() + 1; i != r.end(); ++i, ++iter) + temp = binary_op(temp, *iter); + + if (first_call && r.begin() > 0) + sum = temp; + else + sum = binary_op(sum, temp); + + first_call = false; + } + + template + void operator()(const ::tbb::blocked_range& r, ::tbb::final_scan_tag) + { + InputIterator iter1 = input + r.begin(); + OutputIterator iter2 = output + r.begin(); + + for (Size i = r.begin(); i != r.end(); ++i, ++iter1, ++iter2) + { + ValueType temp = binary_op(sum, *iter1); + *iter2 = sum; + sum = temp; + } + + first_call = false; + } + + void reverse_join(exclusive_body& b) + { + sum = binary_op(b.sum, sum); + } + + void assign(exclusive_body& b) + { + sum = 
b.sum; + } +}; + +} // end scan_detail + + + +template + OutputIterator inclusive_scan(tag, + InputIterator first, + InputIterator last, + OutputIterator result, + BinaryFunction binary_op) +{ + // the pseudocode for deducing the type of the temporary used below: + // + // if BinaryFunction is AdaptableBinaryFunction + // TemporaryType = AdaptableBinaryFunction::result_type + // else if OutputIterator is a "pure" output iterator + // TemporaryType = InputIterator::value_type + // else + // TemporaryType = OutputIterator::value_type + // + // XXX upon c++0x, TemporaryType needs to be: + // result_of::type + + using namespace thrust::detail; + + typedef typename eval_if< + has_result_type::value, + result_type, + eval_if< + is_output_iterator::value, + thrust::iterator_value, + thrust::iterator_value + > + >::type ValueType; + + typedef typename thrust::iterator_difference::type Size; + + Size n = thrust::distance(first, last); + + if (n != 0) + { + typedef typename scan_detail::inclusive_body Body; + Body scan_body(first, result, binary_op, *first); + ::tbb::parallel_scan(::tbb::blocked_range(0,n), scan_body); + } + + thrust::advance(result, n); + + return result; +} + + +template + OutputIterator exclusive_scan(tag, + InputIterator first, + InputIterator last, + OutputIterator result, + T init, + BinaryFunction binary_op) +{ + // the pseudocode for deducing the type of the temporary used below: + // + // if BinaryFunction is AdaptableBinaryFunction + // TemporaryType = AdaptableBinaryFunction::result_type + // else if OutputIterator is a "pure" output iterator + // TemporaryType = InputIterator::value_type + // else + // TemporaryType = OutputIterator::value_type + // + // XXX upon c++0x, TemporaryType needs to be: + // result_of::type + + using namespace thrust::detail; + + typedef typename eval_if< + has_result_type::value, + result_type, + eval_if< + is_output_iterator::value, + thrust::iterator_value, + thrust::iterator_value + > + >::type ValueType; + + typedef typename thrust::iterator_difference::type Size; + + Size n = thrust::distance(first, last); + + if (n != 0) + { + typedef typename scan_detail::exclusive_body Body; + Body scan_body(first, result, binary_op, init); + ::tbb::parallel_scan(::tbb::blocked_range(0,n), scan_body); + } + + thrust::advance(result, n); + + return result; +} + +} // end namespace detail +} // end namespace tbb +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/tbb/detail/scan_by_key.h b/compat/thrust/system/tbb/detail/scan_by_key.h new file mode 100644 index 0000000..cad4fc1 --- /dev/null +++ b/compat/thrust/system/tbb/detail/scan_by_key.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
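For quick reference, the serial meaning of the two scan flavors implemented above (illustrative standard C++; + stands in for binary_op): inclusive_scan writes out[i] = in[0] op ... op in[i], while exclusive_scan writes out[i] = init op in[0] op ... op in[i-1].

    #include <cstdio>
    #include <vector>

    int main()
    {
        std::vector<int> in(4);
        in[0] = 1; in[1] = 2; in[2] = 3; in[3] = 4;
        std::vector<int> inc(in.size()), exc(in.size());
        int init = 10;

        int sum = 0;
        for (std::size_t i = 0; i < in.size(); ++i)
            inc[i] = sum = sum + in[i];                 // inclusive: 1 3 6 10

        sum = init;
        for (std::size_t i = 0; i < in.size(); ++i)
        {
            exc[i] = sum;                               // exclusive: 10 11 13 16
            sum = sum + in[i];
        }

        for (std::size_t i = 0; i < in.size(); ++i)
            std::printf("%d %d\n", inc[i], exc[i]);
        return 0;
    }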
+ */ + +#pragma once + +#include + +// this system inherits scan_by_key +#include + diff --git a/compat/thrust/system/tbb/detail/scatter.h b/compat/thrust/system/tbb/detail/scatter.h new file mode 100644 index 0000000..c48555d --- /dev/null +++ b/compat/thrust/system/tbb/detail/scatter.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system inherits this algorithm +#include + diff --git a/compat/thrust/system/tbb/detail/sequence.h b/compat/thrust/system/tbb/detail/sequence.h new file mode 100644 index 0000000..811d8f5 --- /dev/null +++ b/compat/thrust/system/tbb/detail/sequence.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system inherits sequence +#include + diff --git a/compat/thrust/system/tbb/detail/set_operations.h b/compat/thrust/system/tbb/detail/set_operations.h new file mode 100644 index 0000000..687edb2 --- /dev/null +++ b/compat/thrust/system/tbb/detail/set_operations.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system inherits set_operations +#include + diff --git a/compat/thrust/system/tbb/detail/sort.h b/compat/thrust/system/tbb/detail/sort.h new file mode 100644 index 0000000..3b6f630 --- /dev/null +++ b/compat/thrust/system/tbb/detail/sort.h @@ -0,0 +1,55 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace tbb +{ +namespace detail +{ + +template + void stable_sort(execution_policy &exec, + RandomAccessIterator first, + RandomAccessIterator last, + StrictWeakOrdering comp); + +template + void stable_sort_by_key(execution_policy &exec, + RandomAccessIterator1 keys_first, + RandomAccessIterator1 keys_last, + RandomAccessIterator2 values_first, + StrictWeakOrdering comp); + +} // end namespace detail +} // end namespace tbb +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/tbb/detail/sort.inl b/compat/thrust/system/tbb/detail/sort.inl new file mode 100644 index 0000000..f292789 --- /dev/null +++ b/compat/thrust/system/tbb/detail/sort.inl @@ -0,0 +1,251 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
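A possible caller-side use of the key/value sort declared above; the header paths are assumed to match stock Thrust, and TBB must be available when the TBB backend is dispatched.

    #include <cstdio>
    #include <thrust/host_vector.h>
    #include <thrust/sort.h>

    int main()
    {
        thrust::host_vector<int>  keys(4);
        thrust::host_vector<char> vals(4);
        keys[0] = 3; keys[1] = 1; keys[2] = 2; keys[3] = 1;
        vals[0] = 'c'; vals[1] = 'a'; vals[2] = 'b'; vals[3] = 'd';

        // sorts keys and applies the same permutation to vals;
        // equal keys keep their relative order (stable)
        thrust::stable_sort_by_key(keys.begin(), keys.end(), vals.begin());

        for (int i = 0; i < 4; ++i)
            std::printf("%d %c\n", (int)keys[i], (char)vals[i]);   // 1 a, 1 d, 2 b, 3 c
        return 0;
    }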
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace tbb +{ +namespace detail +{ +namespace sort_detail +{ + +// TODO tune this based on data type and comp +const static int threshold = 128 * 1024; + +template +void merge_sort(execution_policy &exec, Iterator1 first1, Iterator1 last1, Iterator2 first2, StrictWeakOrdering comp, bool inplace); + +template +struct merge_sort_closure +{ + execution_policy &exec; + Iterator1 first1, last1; + Iterator2 first2; + StrictWeakOrdering comp; + bool inplace; + + merge_sort_closure(execution_policy &exec, Iterator1 first1, Iterator1 last1, Iterator2 first2, StrictWeakOrdering comp, bool inplace) + : exec(exec), first1(first1), last1(last1), first2(first2), comp(comp), inplace(inplace) + {} + + void operator()(void) const + { + merge_sort(exec, first1, last1, first2, comp, inplace); + } +}; + + +template +void merge_sort(execution_policy &exec, Iterator1 first1, Iterator1 last1, Iterator2 first2, StrictWeakOrdering comp, bool inplace) +{ + typedef typename thrust::iterator_difference::type difference_type; + + difference_type n = thrust::distance(first1, last1); + + if (n < threshold) + { + thrust::system::detail::internal::scalar::stable_sort(first1, last1, comp); + + if (!inplace) + thrust::system::detail::internal::scalar::copy(first1, last1, first2); + + return; + } + + Iterator1 mid1 = first1 + (n / 2); + Iterator2 mid2 = first2 + (n / 2); + Iterator2 last2 = first2 + n; + + typedef merge_sort_closure Closure; + + Closure left (exec, first1, mid1, first2, comp, !inplace); + Closure right(exec, mid1, last1, mid2, comp, !inplace); + + ::tbb::parallel_invoke(left, right); + + if (inplace) thrust::merge(exec, first2, mid2, mid2, last2, first1, comp); + else thrust::merge(exec, first1, mid1, mid1, last1, first2, comp); +} + +} // end namespace sort_detail + + +namespace sort_by_key_detail +{ + +// TODO tune this based on data type and comp +const static int threshold = 128 * 1024; + +template +void merge_sort_by_key(execution_policy &exec, + Iterator1 first1, + Iterator1 last1, + Iterator2 first2, + Iterator3 first3, + Iterator4 first4, + StrictWeakOrdering comp, + bool inplace); + +template +struct merge_sort_by_key_closure +{ + execution_policy &exec; + Iterator1 first1, last1; + Iterator2 first2; + Iterator3 first3; + Iterator4 first4; + StrictWeakOrdering comp; + bool inplace; + + merge_sort_by_key_closure(execution_policy &exec, + Iterator1 first1, + Iterator1 last1, + Iterator2 first2, + Iterator3 first3, + Iterator4 first4, + StrictWeakOrdering comp, + bool inplace) + : exec(exec), first1(first1), last1(last1), first2(first2), first3(first3), first4(first4), comp(comp), inplace(inplace) + {} + + void operator()(void) const + { + merge_sort_by_key(exec, first1, last1, first2, first3, first4, comp, inplace); + } +}; + + +template +void merge_sort_by_key(execution_policy &exec, + Iterator1 first1, + Iterator1 last1, + Iterator2 first2, + Iterator3 first3, + Iterator4 first4, + StrictWeakOrdering comp, + bool inplace) +{ + typedef typename thrust::iterator_difference::type difference_type; + + difference_type n = thrust::distance(first1, last1); + + Iterator1 mid1 = first1 + (n / 2); + Iterator2 mid2 = first2 + (n / 2); + Iterator3 mid3 = first3 + (n / 2); + Iterator4 mid4 = first4 + (n / 2); + Iterator2 last2 = first2 + n; + Iterator3 last3 = first3 + n; + + if (n < threshold) + { + thrust::system::detail::internal::scalar::stable_sort_by_key(first1, last1, 
first2, comp); + + if (!inplace) + { + thrust::system::detail::internal::scalar::copy(first1, last1, first3); + thrust::system::detail::internal::scalar::copy(first2, last2, first4); + } + + return; + } + + typedef merge_sort_by_key_closure Closure; + + Closure left (exec, first1, mid1, first2, first3, first4, comp, !inplace); + Closure right(exec, mid1, last1, mid2, mid3, mid4, comp, !inplace); + + ::tbb::parallel_invoke(left, right); + + if(inplace) + { + thrust::merge_by_key(exec, first3, mid3, mid3, last3, first4, mid4, first1, first2, comp); + } + else + { + thrust::merge_by_key(exec, first1, mid1, mid1, last1, first2, mid2, first3, first4, comp); + } +} + +} // end namespace sort_detail + +template +void stable_sort(execution_policy &exec, + RandomAccessIterator first, + RandomAccessIterator last, + StrictWeakOrdering comp) +{ + typedef typename thrust::iterator_value::type key_type; + + thrust::detail::temporary_array temp(exec, first, last); + + sort_detail::merge_sort(exec, first, last, temp.begin(), comp, true); +} + +template + void stable_sort_by_key(execution_policy &exec, + RandomAccessIterator1 first1, + RandomAccessIterator1 last1, + RandomAccessIterator2 first2, + StrictWeakOrdering comp) +{ + typedef typename thrust::iterator_value::type key_type; + typedef typename thrust::iterator_value::type val_type; + + RandomAccessIterator2 last2 = first2 + thrust::distance(first1, last1); + + thrust::detail::temporary_array temp1(exec, first1, last1); + thrust::detail::temporary_array temp2(exec, first2, last2); + + sort_by_key_detail::merge_sort_by_key(exec, first1, last1, first2, temp1.begin(), temp2.begin(), comp, true); +} + +} // end namespace detail +} // end namespace tbb +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/tbb/detail/swap_ranges.h b/compat/thrust/system/tbb/detail/swap_ranges.h new file mode 100644 index 0000000..15f8f55 --- /dev/null +++ b/compat/thrust/system/tbb/detail/swap_ranges.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// tbb inherits swap_ranges +#include + diff --git a/compat/thrust/system/tbb/detail/tabulate.h b/compat/thrust/system/tbb/detail/tabulate.h new file mode 100644 index 0000000..da65d8e --- /dev/null +++ b/compat/thrust/system/tbb/detail/tabulate.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
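A sequential standard-C++ sketch of the ping-pong buffer scheme used by merge_sort above: each level sorts its two halves into the other buffer and then merges back, so the inplace flag flips at every recursion level (the real code runs the two halves concurrently with tbb::parallel_invoke and uses a much larger leaf threshold).

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    static void merge_sort(std::vector<int>& a, std::vector<int>& b,
                           std::size_t first, std::size_t last, bool inplace)
    {
        std::size_t n = last - first;
        if (n < 2)                                    // 0 or 1 elements: already sorted
        {
            if (!inplace)
                std::copy(a.begin() + first, a.begin() + last, b.begin() + first);
            return;
        }
        std::size_t mid = first + n / 2;
        merge_sort(a, b, first, mid, !inplace);       // children write into the other buffer
        merge_sort(a, b, mid, last, !inplace);        // (parallel_invoke in the real code)
        if (inplace)
            std::merge(b.begin() + first, b.begin() + mid,
                       b.begin() + mid,   b.begin() + last, a.begin() + first);
        else
            std::merge(a.begin() + first, a.begin() + mid,
                       a.begin() + mid,   a.begin() + last, b.begin() + first);
    }

    int main()
    {
        std::vector<int> a(5);
        a[0] = 5; a[1] = 3; a[2] = 4; a[3] = 1; a[4] = 2;
        std::vector<int> tmp(a.size());
        merge_sort(a, tmp, 0, a.size(), true);        // final result lands in a
        for (std::size_t i = 0; i < a.size(); ++i)
            std::printf("%d\n", a[i]);                // 1 2 3 4 5
        return 0;
    }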
+ */ + +#pragma once + +#include + +// this system inherits tabulate +#include + diff --git a/compat/thrust/system/tbb/detail/temporary_buffer.h b/compat/thrust/system/tbb/detail/temporary_buffer.h new file mode 100644 index 0000000..628bd75 --- /dev/null +++ b/compat/thrust/system/tbb/detail/temporary_buffer.h @@ -0,0 +1,22 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system has no special temporary buffer functions + diff --git a/compat/thrust/system/tbb/detail/transform.h b/compat/thrust/system/tbb/detail/transform.h new file mode 100644 index 0000000..70ce1f4 --- /dev/null +++ b/compat/thrust/system/tbb/detail/transform.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// omp inherits transform +#include + diff --git a/compat/thrust/system/tbb/detail/transform_reduce.h b/compat/thrust/system/tbb/detail/transform_reduce.h new file mode 100644 index 0000000..23ed070 --- /dev/null +++ b/compat/thrust/system/tbb/detail/transform_reduce.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system inherits transform_reduce +#include + diff --git a/compat/thrust/system/tbb/detail/transform_scan.h b/compat/thrust/system/tbb/detail/transform_scan.h new file mode 100644 index 0000000..fc2e55d --- /dev/null +++ b/compat/thrust/system/tbb/detail/transform_scan.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system inherits transform_scan +#include + diff --git a/compat/thrust/system/tbb/detail/uninitialized_copy.h b/compat/thrust/system/tbb/detail/uninitialized_copy.h new file mode 100644 index 0000000..944f4ba --- /dev/null +++ b/compat/thrust/system/tbb/detail/uninitialized_copy.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system inherits uninitialized_copy +#include + diff --git a/compat/thrust/system/tbb/detail/uninitialized_fill.h b/compat/thrust/system/tbb/detail/uninitialized_fill.h new file mode 100644 index 0000000..b9d6de2 --- /dev/null +++ b/compat/thrust/system/tbb/detail/uninitialized_fill.h @@ -0,0 +1,23 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// this system inherits uninitialized_fill +#include + diff --git a/compat/thrust/system/tbb/detail/unique.h b/compat/thrust/system/tbb/detail/unique.h new file mode 100644 index 0000000..34538ca --- /dev/null +++ b/compat/thrust/system/tbb/detail/unique.h @@ -0,0 +1,59 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in ctbbliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace tbb +{ +namespace detail +{ + + +template + ForwardIterator unique(execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + BinaryPredicate binary_pred); + + +template + OutputIterator unique_copy(execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator output, + BinaryPredicate binary_pred); + + +} // end namespace detail +} // end namespace tbb +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/tbb/detail/unique.inl b/compat/thrust/system/tbb/detail/unique.inl new file mode 100644 index 0000000..06e6a30 --- /dev/null +++ b/compat/thrust/system/tbb/detail/unique.inl @@ -0,0 +1,66 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in ctbbliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace tbb +{ +namespace detail +{ + + +template + ForwardIterator unique(execution_policy &exec, + ForwardIterator first, + ForwardIterator last, + BinaryPredicate binary_pred) +{ + // tbb prefers generic::unique to cpp::unique + return thrust::system::detail::generic::unique(exec,first,last,binary_pred); +} // end unique() + + +template + OutputIterator unique_copy(execution_policy &exec, + InputIterator first, + InputIterator last, + OutputIterator output, + BinaryPredicate binary_pred) +{ + // tbb prefers generic::unique_copy to cpp::unique_copy + return thrust::system::detail::generic::unique_copy(exec,first,last,output,binary_pred); +} // end unique_copy() + + +} // end namespace detail +} // end namespace tbb +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/tbb/detail/unique_by_key.h b/compat/thrust/system/tbb/detail/unique_by_key.h new file mode 100644 index 0000000..c6d0532 --- /dev/null +++ b/compat/thrust/system/tbb/detail/unique_by_key.h @@ -0,0 +1,67 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in ctbbliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
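As a reminder of the semantics behind unique/unique_copy above, only consecutive duplicates are collapsed, so fully deduplicating unsorted data needs a sort first. Illustrated here with std::unique, which has the same consecutive-duplicate behavior.

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    int main()
    {
        std::vector<int> v(5);
        v[0] = 1; v[1] = 1; v[2] = 2; v[3] = 2; v[4] = 1;
        v.erase(std::unique(v.begin(), v.end()), v.end());   // {1, 2, 1} - the trailing 1 survives
        for (std::size_t i = 0; i < v.size(); ++i)
            std::printf("%d\n", v[i]);
        return 0;
    }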
+ */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace tbb +{ +namespace detail +{ + + +template + thrust::pair + unique_by_key(execution_policy &exec, + ForwardIterator1 keys_first, + ForwardIterator1 keys_last, + ForwardIterator2 values_first, + BinaryPredicate binary_pred); + + +template + thrust::pair + unique_by_key_copy(execution_policy &exec, + InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + OutputIterator1 keys_output, + OutputIterator2 values_output, + BinaryPredicate binary_pred); + + +} // end namespace detail +} // end namespace tbb +} // end namespace system +} // end namespace thrust + +#include + diff --git a/compat/thrust/system/tbb/detail/unique_by_key.inl b/compat/thrust/system/tbb/detail/unique_by_key.inl new file mode 100644 index 0000000..7747ca4 --- /dev/null +++ b/compat/thrust/system/tbb/detail/unique_by_key.inl @@ -0,0 +1,74 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in ctbbliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace tbb +{ +namespace detail +{ + + +template + thrust::pair + unique_by_key(execution_policy &exec, + ForwardIterator1 keys_first, + ForwardIterator1 keys_last, + ForwardIterator2 values_first, + BinaryPredicate binary_pred) +{ + // tbb prefers generic::unique_by_key to cpp::unique_by_key + return thrust::system::detail::generic::unique_by_key(exec,keys_first,keys_last,values_first,binary_pred); +} // end unique_by_key() + + +template + thrust::pair + unique_by_key_copy(execution_policy &exec, + InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + OutputIterator1 keys_output, + OutputIterator2 values_output, + BinaryPredicate binary_pred) +{ + // tbb prefers generic::unique_by_key_copy to cpp::unique_by_key_copy + return thrust::system::detail::generic::unique_by_key_copy(exec,keys_first,keys_last,values_first,keys_output,values_output,binary_pred); +} // end unique_by_key_copy() + + +} // end namespace detail +} // end namespace tbb +} // end namespace system +} // end namespace thrust + diff --git a/compat/thrust/system/tbb/detail/vector.inl b/compat/thrust/system/tbb/detail/vector.inl new file mode 100644 index 0000000..d87e670 --- /dev/null +++ b/compat/thrust/system/tbb/detail/vector.inl @@ -0,0 +1,97 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in ctbbliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
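A serial standard-C++ sketch of the unique_by_key semantics forwarded above: each run of consecutive equal keys keeps only its first (key, value) pair.

    #include <cstdio>
    #include <vector>

    int main()
    {
        std::vector<int>  keys(5);
        std::vector<char> vals(5);
        keys[0] = 1; keys[1] = 1; keys[2] = 2; keys[3] = 3; keys[4] = 3;
        vals[0] = 'a'; vals[1] = 'b'; vals[2] = 'c'; vals[3] = 'd'; vals[4] = 'e';

        std::size_t out = 0;
        for (std::size_t i = 0; i < keys.size(); ++i)
        {
            if (i == 0 || keys[i] != keys[out - 1])   // start of a new run of keys
            {
                keys[out] = keys[i];
                vals[out] = vals[i];
                ++out;
            }
        }
        keys.resize(out);                             // {1, 2, 3}
        vals.resize(out);                             // {'a', 'c', 'd'}

        for (std::size_t i = 0; i < keys.size(); ++i)
            std::printf("%d %c\n", keys[i], vals[i]);
        return 0;
    }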
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace thrust +{ +namespace system +{ +namespace tbb +{ + +template + vector + ::vector() + : super_t() +{} + +template + vector + ::vector(size_type n) + : super_t(n) +{} + +template + vector + ::vector(size_type n, const value_type &value) + : super_t(n,value) +{} + +template + vector + ::vector(const vector &x) + : super_t(x) +{} + +template + template + vector + ::vector(const thrust::detail::vector_base &x) + : super_t(x) +{} + +template + template + vector + ::vector(const std::vector &x) + : super_t(x) +{} + +template + template + vector + ::vector(InputIterator first, InputIterator last) + : super_t(first,last) +{} + +template + template + vector & + vector + ::operator=(const std::vector &x) +{ + super_t::operator=(x); + return *this; +} + +template + template + vector & + vector + ::operator=(const thrust::detail::vector_base &x) +{ + super_t::operator=(x); + return *this; +} + +} // end tbb +} // end system +} // end thrust + diff --git a/compat/thrust/system/tbb/execution_policy.h b/compat/thrust/system/tbb/execution_policy.h new file mode 100644 index 0000000..c462586 --- /dev/null +++ b/compat/thrust/system/tbb/execution_policy.h @@ -0,0 +1,156 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +/*! \file thrust/system/tbb/execution_policy.h + * \brief Execution policies for Thrust's TBB system. + */ + +#include + +// get the execution policies definitions first +#include + +// get the definition of par +#include + +// now get all the algorithm definitions + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +// define these entities here for the purpose of Doxygenating them +// they are actually defined elsewhere +#if 0 +namespace thrust +{ +namespace system +{ +namespace tbb +{ + + +/*! \addtogroup execution_policies + * \{ + */ + + +/*! \p thrust::tbb::execution_policy is the base class for all Thrust parallel execution + * policies which are derived from Thrust's TBB backend system. + */ +template +struct execution_policy : thrust::execution_policy +{}; + + +/*! \p tbb::tag is a type representing Thrust's TBB backend system in C++'s type system. + * Iterators "tagged" with a type which is convertible to \p tbb::tag assert that they may be + * "dispatched" to algorithm implementations in the \p tbb system. + */ +struct tag : thrust::system::tbb::execution_policy { unspecified }; + + +/*! \p thrust::tbb::par is the parallel execution policy associated with Thrust's TBB + * backend system. 
+ * + * Instead of relying on implicit algorithm dispatch through iterator system tags, users may + * directly target Thrust's TBB backend system by providing \p thrust::tbb::par as an algorithm + * parameter. + * + * Explicit dispatch can be useful in avoiding the introduction of data copies into containers such + * as \p thrust::tbb::vector. + * + * The type of \p thrust::tbb::par is implementation-defined. + * + * The following code snippet demonstrates how to use \p thrust::tbb::par to explicitly dispatch an + * invocation of \p thrust::for_each to the TBB backend system: + * + * \code + * #include + * #include + * #include + * + * struct printf_functor + * { + * __host__ __device__ + * void operator()(int x) + * { + * printf("%d\n"); + * } + * }; + * ... + * int vec[3]; + * vec[0] = 0; vec[1] = 1; vec[2] = 2; + * + * thrust::for_each(thrust::tbb::par, vec.begin(), vec.end(), printf_functor()); + * + * // 0 1 2 is printed to standard output in some unspecified order + * \endcode + */ +static const unspecified par; + + +/*! \} + */ + + +} // end tbb +} // end system +} // end thrust +#endif + + diff --git a/compat/thrust/system/tbb/memory.h b/compat/thrust/system/tbb/memory.h new file mode 100644 index 0000000..deea7ee --- /dev/null +++ b/compat/thrust/system/tbb/memory.h @@ -0,0 +1,414 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in ctbbliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file thrust/system/tbb/memory.h + * \brief Managing memory associated with Thrust's TBB system. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace tbb +{ + +template class pointer; + +} // end tbb +} // end system +} // end thrust + + +/*! \cond + */ + +// specialize std::iterator_traits to avoid problems with the name of +// pointer's constructor shadowing its nested pointer type +// do this before pointer is defined so the specialization is correctly +// used inside the definition +namespace std +{ + +template + struct iterator_traits > +{ + private: + typedef thrust::system::tbb::pointer ptr; + + public: + typedef typename ptr::iterator_category iterator_category; + typedef typename ptr::value_type value_type; + typedef typename ptr::difference_type difference_type; + typedef ptr pointer; + typedef typename ptr::reference reference; +}; // end iterator_traits + +} // end std + +/*! \endcond + */ + + +namespace thrust +{ +namespace system +{ + +/*! \addtogroup system_backends Systems + * \ingroup system + * \{ + */ + +/*! \namespace thrust::system::tbb + * \brief \p thrust::system::tbb is the namespace containing functionality for allocating, manipulating, + * and deallocating memory available to Thrust's TBB backend system. + * The identifiers are provided in a separate namespace underneath thrust::system + * for import convenience but are also aliased in the top-level thrust::tbb + * namespace for easy access. 
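A self-contained, host-only variant of the explicit-dispatch example documented above; the header paths are assumed to follow stock Thrust, printf_functor is illustrative, and TBB must be linked.

    #include <cstdio>
    #include <thrust/for_each.h>
    #include <thrust/system/tbb/execution_policy.h>

    struct printf_functor
    {
        void operator()(int x) const
        {
            std::printf("%d\n", x);    // print the element being visited
        }
    };

    int main()
    {
        int vec[3] = {0, 1, 2};
        // dispatch directly to the TBB backend instead of relying on iterator tags
        thrust::for_each(thrust::tbb::par, vec, vec + 3, printf_functor());
        return 0;                      // 0 1 2 printed in some unspecified order
    }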
+ * + */ +namespace tbb +{ + +// forward declaration of reference for pointer +template class reference; + +/*! \cond + */ + +// XXX nvcc + msvc have trouble instantiating reference below +// this is a workaround +namespace detail +{ + +template + struct reference_msvc_workaround +{ + typedef thrust::system::tbb::reference type; +}; // end reference_msvc_workaround + +} // end detail + +/*! \endcond + */ + + +/*! \p pointer stores a pointer to an object allocated in memory available to the tbb system. + * This type provides type safety when dispatching standard algorithms on ranges resident + * in tbb memory. + * + * \p pointer has pointer semantics: it may be dereferenced and manipulated with pointer arithmetic. + * + * \p pointer can be created with the function \p tbb::malloc, or by explicitly calling its constructor + * with a raw pointer. + * + * The raw pointer encapsulated by a \p pointer may be obtained by eiter its get member function + * or the \p raw_pointer_cast function. + * + * \note \p pointer is not a "smart" pointer; it is the programmer's responsibility to deallocate memory + * pointed to by \p pointer. + * + * \tparam T specifies the type of the pointee. + * + * \see tbb::malloc + * \see tbb::free + * \see raw_pointer_cast + */ +template + class pointer + : public thrust::pointer< + T, + thrust::system::tbb::tag, + thrust::system::tbb::reference, + thrust::system::tbb::pointer + > +{ + /*! \cond + */ + + private: + typedef thrust::pointer< + T, + thrust::system::tbb::tag, + //thrust::system::tbb::reference, + typename detail::reference_msvc_workaround::type, + thrust::system::tbb::pointer + > super_t; + + /*! \endcond + */ + + public: + // note that tbb::pointer's member functions need __host__ __device__ + // to interoperate with nvcc + iterators' dereference member function + + /*! \p pointer's no-argument constructor initializes its encapsulated pointer to \c 0. + */ + __host__ __device__ + pointer() : super_t() {} + + /*! This constructor allows construction of a pointer from a T*. + * + * \param ptr A raw pointer to copy from, presumed to point to a location in memory + * accessible by the \p tbb system. + * \tparam OtherT \p OtherT shall be convertible to \p T. + */ + template + __host__ __device__ + explicit pointer(OtherT *ptr) : super_t(ptr) {} + + /*! This constructor allows construction from another pointer-like object with related type. + * + * \param other The \p OtherPointer to copy. + * \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible + * to \p thrust::system::tbb::tag and its element type shall be convertible to \p T. + */ + template + __host__ __device__ + pointer(const OtherPointer &other, + typename thrust::detail::enable_if_pointer_is_convertible< + OtherPointer, + pointer + >::type * = 0) : super_t(other) {} + + /*! Assignment operator allows assigning from another pointer-like object with related type. + * + * \param other The other pointer-like object to assign from. + * \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible + * to \p thrust::system::tbb::tag and its element type shall be convertible to \p T. + */ + template + __host__ __device__ + typename thrust::detail::enable_if_pointer_is_convertible< + OtherPointer, + pointer, + pointer & + >::type + operator=(const OtherPointer &other) + { + return super_t::operator=(other); + } +}; // end pointer + + +/*! \p reference is a wrapped reference to an object stored in memory available to the \p tbb system. 
+ * \p reference is the type of the result of dereferencing a \p tbb::pointer. + * + * \tparam T Specifies the type of the referenced object. + */ +template + class reference + : public thrust::reference< + T, + thrust::system::tbb::pointer, + thrust::system::tbb::reference + > +{ + /*! \cond + */ + + private: + typedef thrust::reference< + T, + thrust::system::tbb::pointer, + thrust::system::tbb::reference + > super_t; + + /*! \endcond + */ + + public: + /*! \cond + */ + + typedef typename super_t::value_type value_type; + typedef typename super_t::pointer pointer; + + /*! \endcond + */ + + /*! This constructor initializes this \p reference to refer to an object + * pointed to by the given \p pointer. After this \p reference is constructed, + * it shall refer to the object pointed to by \p ptr. + * + * \param ptr A \p pointer to copy from. + */ + __host__ __device__ + explicit reference(const pointer &ptr) + : super_t(ptr) + {} + + /*! This constructor accepts a const reference to another \p reference of related type. + * After this \p reference is constructed, it shall refer to the same object as \p other. + * + * \param other A \p reference to copy from. + * \tparam OtherT The element type of the other \p reference. + * + * \note This constructor is templated primarily to allow initialization of reference + * from reference. + */ + template + __host__ __device__ + reference(const reference &other, + typename thrust::detail::enable_if_convertible< + typename reference::pointer, + pointer + >::type * = 0) + : super_t(other) + {} + + /*! Copy assignment operator copy assigns from another \p reference of related type. + * + * \param other The other \p reference to assign from. + * \return *this + * \tparam OtherT The element type of the other \p reference. + */ + template + reference &operator=(const reference &other); + + /*! Assignment operator assigns from a \p value_type. + * + * \param x The \p value_type to assign from. + * \return *this + */ + reference &operator=(const value_type &x); +}; // end reference + +/*! Exchanges the values of two objects referred to by \p reference. + * \p x The first \p reference of interest. + * \p y The second \p reference ot interest. + */ +template +__host__ __device__ +void swap(reference x, reference y); + +/*! Allocates an area of memory available to Thrust's tbb system. + * \param n Number of bytes to allocate. + * \return A tbb::pointer pointing to the beginning of the newly + * allocated memory. A null tbb::pointer is returned if + * an error occurs. + * \note The tbb::pointer returned by this function must be + * deallocated with \p tbb::free. + * \see tbb::free + * \see std::malloc + */ +inline pointer malloc(std::size_t n); + +/*! Allocates a typed area of memory available to Thrust's tbb system. + * \param n Number of elements to allocate. + * \return A tbb::pointer pointing to the beginning of the newly + * allocated memory. A null tbb::pointer is returned if + * an error occurs. + * \note The tbb::pointer returned by this function must be + * deallocated with \p tbb::free. + * \see tbb::free + * \see std::malloc + */ +template +inline pointer malloc(std::size_t n); + +/*! Deallocates an area of memory previously allocated by tbb::malloc. + * \param ptr A tbb::pointer pointing to the beginning of an area + * of memory previously allocated with tbb::malloc. + * \see tbb::malloc + * \see std::free + */ +inline void free(pointer ptr); + +// XXX upon c++11 +// template using allocator = thrust::detail::malloc_allocator >; + +/*! 
\p tbb::allocator is the default allocator used by the \p tbb system's containers such as + * tbb::vector if no user-specified allocator is provided. \p tbb::allocator allocates + * (deallocates) storage with \p tbb::malloc (\p tbb::free). + */ +template + struct allocator + : thrust::detail::malloc_allocator< + T, + tag, + pointer + > +{ + /*! The \p rebind metafunction provides the type of an \p allocator + * instantiated with another type. + * + * \tparam U The other type to use for instantiation. + */ + template + struct rebind + { + /*! The typedef \p other gives the type of the rebound \p allocator. + */ + typedef allocator other; + }; + + /*! No-argument constructor has no effect. + */ + __host__ __device__ + inline allocator() {} + + /*! Copy constructor has no effect. + */ + __host__ __device__ + inline allocator(const allocator &) {} + + /*! Constructor from other \p allocator has no effect. + */ + template + __host__ __device__ + inline allocator(const allocator &) {} + + /*! Destructor has no effect. + */ + __host__ __device__ + inline ~allocator() {} +}; // end allocator + +} // end tbb + +/*! \} + */ + +} // end system + +/*! \namespace thrust::tbb + * \brief \p thrust::tbb is a top-level alias for thrust::system::tbb. + */ +namespace tbb +{ + +using thrust::system::tbb::pointer; +using thrust::system::tbb::reference; +using thrust::system::tbb::malloc; +using thrust::system::tbb::free; +using thrust::system::tbb::allocator; + +} // end tbb + +} // end thrust + +#include + diff --git a/compat/thrust/system/tbb/vector.h b/compat/thrust/system/tbb/vector.h new file mode 100644 index 0000000..1c49c3f --- /dev/null +++ b/compat/thrust/system/tbb/vector.h @@ -0,0 +1,144 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in ctbbliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file thrust/system/tbb/vector.h + * \brief A dynamically-sizable array of elements which reside in memory available to + * Thrust's TBB system. + */ + +#pragma once + +#include +#include +#include +#include + +namespace thrust +{ +namespace system +{ +namespace tbb +{ + +// XXX upon c++11 +// template > using vector = thrust::detail::vector_base; + +/*! \p tbb::vector is a container that supports random access to elements, + * constant time removal of elements at the end, and linear time insertion + * and removal of elements at the beginning or in the middle. The number of + * elements in a \p tbb::vector may vary dynamically; memory management is + * automatic. The elements contained in a \p tbb::vector reside in memory + * available to the \p tbb system. + * + * \tparam T The element type of the \p tbb::vector. + * \tparam Allocator The allocator type of the \p tbb::vector. Defaults to \p tbb::allocator. + * + * \see http://www.sgi.com/tech/stl/Vector.html + * \see host_vector For the documentation of the complete interface which is + * shared by \p tbb::vector + * \see device_vector + */ +template > + class vector + : public thrust::detail::vector_base +{ + /*! 
\cond + */ + private: + typedef thrust::detail::vector_base super_t; + /*! \endcond + */ + + public: + + /*! \cond + */ + typedef typename super_t::size_type size_type; + typedef typename super_t::value_type value_type; + /*! \endcond + */ + + /*! This constructor creates an empty \p tbb::vector. + */ + vector(); + + /*! This constructor creates a \p tbb::vector with \p n default-constructed elements. + * \param n The size of the \p tbb::vector to create. + */ + explicit vector(size_type n); + + /*! This constructor creates a \p tbb::vector with \p n copies of \p value. + * \param n The size of the \p tbb::vector to create. + * \param value An element to copy. + */ + explicit vector(size_type n, const value_type &value); + + /*! Copy constructor copies from another \p tbb::vector. + * \param x The other \p tbb::vector to copy. + */ + vector(const vector &x); + + /*! This constructor copies from another Thrust vector-like object. + * \param x The other object to copy from. + */ + template + vector(const thrust::detail::vector_base &x); + + /*! This constructor copies from a \c std::vector. + * \param x The \c std::vector to copy from. + */ + template + vector(const std::vector &x); + + /*! This constructor creates a \p tbb::vector by copying from a range. + * \param first The beginning of the range. + * \param last The end of the range. + */ + template + vector(InputIterator first, InputIterator last); + + // XXX vector_base should take a Derived type so we don't have to define these superfluous assigns + + /*! Assignment operator assigns from a \c std::vector. + * \param x The \c std::vector to assign from. + * \return *this + */ + template + vector &operator=(const std::vector &x); + + /*! Assignment operator assigns from another Thrust vector-like object. + * \param x The other object to assign from. + * \return *this + */ + template + vector &operator=(const thrust::detail::vector_base &x); +}; // end vector + +} // end tbb +} // end system + +// alias system::tbb names at top-level +namespace tbb +{ + +using thrust::system::tbb::vector; + +} // end tbb + +} // end thrust + +#include + diff --git a/compat/thrust/system_error.h b/compat/thrust/system_error.h new file mode 100644 index 0000000..ce88fe6 --- /dev/null +++ b/compat/thrust/system_error.h @@ -0,0 +1,51 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file thrust/system_error.h + * \brief System diagnostics + */ + +#pragma once + +#include + +namespace thrust +{ + +/*! \addtogroup system System Access + * \{ + */ + +/*! \namespace thrust::system + * \brief \p thrust::system is the namespace which contains functionality for manipulating + * memory specific to one of Thrust's backend systems. It also contains functionality + * for reporting error conditions originating from the operating system or other + * low-level application program interfaces such as the CUDA runtime. 
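+ *
+ * The following snippet sketches how such an error may be caught as a
+ * \p thrust::system_error exception; it is illustrative only and assumes a device
+ * backend such as CUDA is active:
+ *
+ * \code
+ * #include <thrust/system_error.h>
+ * #include <thrust/device_vector.h>
+ * #include <thrust/sort.h>
+ * #include <iostream>
+ * ...
+ * try
+ * {
+ *   thrust::device_vector<int> v(100, 1);
+ *   thrust::sort(v.begin(), v.end());
+ * }
+ * catch(thrust::system_error &e)
+ * {
+ *   // the error condition reported by the backend is available through what()
+ *   std::cerr << "error: " << e.what() << std::endl;
+ * }
+ * \endcode
+ *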
+ * These components are provided in a separate namespace for import convenience but are
+ * also aliased in the top-level \p thrust namespace for easy access.
+ */
+namespace system
+{
+} // end system
+
+/*! \} // end system
+ */
+
+} // end thrust
+
+#include <thrust/system/error_code.h>
+#include <thrust/system/system_error.h>
+
diff --git a/compat/thrust/tabulate.h b/compat/thrust/tabulate.h
new file mode 100644
index 0000000..c87edf0
--- /dev/null
+++ b/compat/thrust/tabulate.h
@@ -0,0 +1,128 @@
+/*
+ * Copyright 2008-2012 NVIDIA Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/*! \file tabulate.h
+ * \brief Fills a range with the tabulation of a function
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+
+namespace thrust
+{
+
+
+/*! \addtogroup transformations
+ * \{
+ */
+
+
+/*! \p tabulate fills the range [first, last) with the value of a function applied to each
+ * element's index.
+ *
+ * For each iterator \c i in the range [first, last), \p tabulate performs the assignment
+ * *i = unary_op(i - first).
+ *
+ * The algorithm's execution is parallelized as determined by \p exec.
+ *
+ * \param exec The execution policy to use for parallelization.
+ * \param first The beginning of the range.
+ * \param last The end of the range.
+ * \param unary_op The unary operation to apply.
+ *
+ * \tparam DerivedPolicy The name of the derived execution policy.
+ * \tparam ForwardIterator is a model of Forward Iterator,
+ *         and \p ForwardIterator is mutable,
+ *         and if \c x and \c y are objects of \c ForwardIterator's \c value_type, then x + y is defined,
+ *         and if \c T is \p ForwardIterator's \c value_type, then T(0) is defined.
+ * \tparam UnaryOperation is a model of Unary Function
+ *         and \c UnaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type.
+ *
+ * The following code snippet demonstrates how to use \p tabulate to generate the first \c n non-positive integers
+ * using the \p thrust::host execution policy for parallelization:
+ *
+ * \code
+ * #include <thrust/tabulate.h>
+ * #include <thrust/functional.h>
+ * #include <thrust/execution_policy.h>
+ * ...
+ * const int N = 10;
+ * int A[N];
+ * thrust::tabulate(thrust::host, A, A + 10, thrust::negate<int>());
+ * // A is now {0, -1, -2, -3, -4, -5, -6, -7, -8, -9}
+ * \endcode
+ *
+ * \see thrust::fill
+ * \see thrust::generate
+ * \see thrust::sequence
+ */
+template<typename DerivedPolicy, typename ForwardIterator, typename UnaryOperation>
+  void tabulate(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                ForwardIterator first,
+                ForwardIterator last,
+                UnaryOperation unary_op);
+
+
+/*! \p tabulate fills the range [first, last) with the value of a function applied to each
+ * element's index.
+ *
+ * For each iterator \c i in the range [first, last), \p tabulate performs the assignment
+ * *i = unary_op(i - first).
+ *
+ * \param first The beginning of the range.
+ * \param last The end of the range.
+ * \param unary_op The unary operation to apply.
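+ *
+ * The following additional snippet is an illustrative sketch: with \p thrust::identity as
+ * the unary operation, \p tabulate fills the range with the element indices themselves,
+ * much like \p thrust::sequence:
+ *
+ * \code
+ * #include <thrust/tabulate.h>
+ * #include <thrust/functional.h>
+ * ...
+ * const int N = 5;
+ * int A[N];
+ * thrust::tabulate(A, A + N, thrust::identity<int>());
+ * // A is now {0, 1, 2, 3, 4}
+ * \endcode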
+ * + * \tparam ForwardIterator is a model of Forward Iterator, + * and \p ForwardIterator is mutable, + * and if \c x and \c y are objects of \c ForwardIterator's \c value_type, then x + y is defined, + * and if \c T is \p ForwardIterator's \c value_type, then T(0) is defined. + * \tparam UnaryOperation is a model of Unary Function + * and \c UnaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type. + * + * The following code snippet demonstrates how to use \p tabulate to generate the first \c n non-positive integers: + * + * \code + * #include + * #include + * ... + * const int N = 10; + * int A[N]; + * thrust::tabulate(A, A + 10, thrust::negate()); + * // A is now {0, -1, -2, -3, -4, -5, -6, -7, -8, -9} + * \endcode + * + * \see thrust::fill + * \see thrust::generate + * \see thrust::sequence + */ +template + void tabulate(ForwardIterator first, + ForwardIterator last, + UnaryOperation unary_op); + + +/*! \} // end transformations + */ + + +} // end namespace thrust + +#include + diff --git a/compat/thrust/transform.h b/compat/thrust/transform.h new file mode 100644 index 0000000..1ada105 --- /dev/null +++ b/compat/thrust/transform.h @@ -0,0 +1,720 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file transform.h + * \brief Transforms input ranges using a function object + */ + +#pragma once + +#include +#include + +namespace thrust +{ + + +/*! \addtogroup algorithms + */ + +/*! \addtogroup transformations + * \ingroup algorithms + * \{ + */ + + +/*! This version of \p transform applies a unary function to each element + * of an input sequence and stores the result in the corresponding + * position in an output sequence. Specifically, for each iterator + * i in the range [\p first, \p last) the operation + * op(*i) is performed and the result is assigned to *o, + * where o is the corresponding output iterator in the range + * [\p result, \p result + (\p last - \p first) ). The input and + * output sequences may coincide, resulting in an in-place transformation. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the input sequence. + * \param last The end of the input sequence. + * \param result The beginning of the output sequence. + * \param op The tranformation operation. + * \return The end of the output sequence. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator is a model of Input Iterator + * and \c InputIterator's \c value_type is convertible to \c UnaryFunction's \c argument_type. + * \tparam OutputIterator is a model of Output Iterator. + * \tparam UnaryFunction is a model of Unary Function + * and \c UnaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type. 
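+ *
+ * The following snippet is an illustrative sketch showing that the transformed values may
+ * also be written to a separate output range rather than transformed in place (the array
+ * names are assumptions of the sketch):
+ *
+ * \code
+ * #include <thrust/transform.h>
+ * #include <thrust/functional.h>
+ * #include <thrust/execution_policy.h>
+ * ...
+ * int input[4]  = {1, 2, 3, 4};
+ * int output[4];
+ *
+ * // negate each element of input and store the result in output
+ * thrust::transform(thrust::host, input, input + 4, output, thrust::negate<int>());
+ *
+ * // output is now {-1, -2, -3, -4}
+ * \endcode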
+ * + * \pre \p first may equal \p result, but the range [first, last) shall not overlap the range [result, result + (last - first)) otherwise. + * + * The following code snippet demonstrates how to use \p transform to negate a range in-place + * using the \p thrust::host execution policy for parallelization: + * + * \code + * #include + * #include + * #include + * ... + * + * int data[10] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8}; + * + * thrust::negate op; + * + * thrust::transform(thrust::host, data, data + 10, data, op); // in-place transformation + * + * // data is now {5, 0, -2, 3, -2, -4, 0, 1, -2, -8}; + * \endcode + * + * \see http://www.sgi.com/tech/stl/transform.html + */ +template + OutputIterator transform(const thrust::detail::execution_policy_base &exec, + InputIterator first, InputIterator last, + OutputIterator result, + UnaryFunction op); + + +/*! This version of \p transform applies a unary function to each element + * of an input sequence and stores the result in the corresponding + * position in an output sequence. Specifically, for each iterator + * i in the range [\p first, \p last) the operation + * op(*i) is performed and the result is assigned to *o, + * where o is the corresponding output iterator in the range + * [\p result, \p result + (\p last - \p first) ). The input and + * output sequences may coincide, resulting in an in-place transformation. + * + * \param first The beginning of the input sequence. + * \param last The end of the input sequence. + * \param result The beginning of the output sequence. + * \param op The tranformation operation. + * \return The end of the output sequence. + * + * \tparam InputIterator is a model of Input Iterator + * and \c InputIterator's \c value_type is convertible to \c UnaryFunction's \c argument_type. + * \tparam OutputIterator is a model of Output Iterator. + * \tparam UnaryFunction is a model of Unary Function + * and \c UnaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type. + * + * \pre \p first may equal \p result, but the range [first, last) shall not overlap the range [result, result + (last - first)) otherwise. + * + * The following code snippet demonstrates how to use \p transform + * + * \code + * #include + * #include + * + * int data[10] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8}; + * + * thrust::negate op; + * + * thrust::transform(data, data + 10, data, op); // in-place transformation + * + * // data is now {5, 0, -2, 3, -2, -4, 0, 1, -2, -8}; + * \endcode + * + * \see http://www.sgi.com/tech/stl/transform.html + */ +template + OutputIterator transform(InputIterator first, InputIterator last, + OutputIterator result, + UnaryFunction op); + + +/*! This version of \p transform applies a binary function to each pair + * of elements from two input sequences and stores the result in the + * corresponding position in an output sequence. Specifically, for + * each iterator i in the range [\p first1, \p last1) and + * j = first + (i - first1) in the range [\p first2, \p last2) + * the operation op(*i,*j) is performed and the result is + * assigned to *o, where o is the corresponding + * output iterator in the range [\p result, \p result + (\p last - \p first) ). + * The input and output sequences may coincide, resulting in an + * in-place transformation. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first1 The beginning of the first input sequence. 
+ * \param last1 The end of the first input sequence. + * \param first2 The beginning of the second input sequence. + * \param result The beginning of the output sequence. + * \param op The tranformation operation. + * \return The end of the output sequence. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator1 is a model of Input Iterator + * and \c InputIterator1's \c value_type is convertible to \c BinaryFunction's \c first_argument_type. + * \tparam InputIterator2 is a model of Input Iterator + * and \c InputIterator2's \c value_type is convertible to \c BinaryFunction's \c second_argument_type. + * \tparam OutputIterator is a model of Output Iterator. + * \tparam BinaryFunction is a model of Binary Function + * and \c BinaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type. + * + * \pre \p first1 may equal \p result, but the range [first1, last1) shall not overlap the range [result, result + (last1 - first1)) otherwise. + * \pre \p first2 may equal \p result, but the range [first2, first2 + (last1 - first1)) shall not overlap the range [result, result + (last1 - first1)) otherwise. + * + * The following code snippet demonstrates how to use \p transform to compute the sum of two + * ranges using the \p thrust::host execution policy for parallelization: + * + * \code + * #include + * #include + * #include + * ... + * + * int input1[6] = {-5, 0, 2, 3, 2, 4}; + * int input2[6] = { 3, 6, -2, 1, 2, 3}; + * int output[6]; + * + * thrust::plus op; + * + * thrust::transform(thrust::host, input1, input1 + 6, input2, output, op); + * + * // output is now {-2, 6, 0, 4, 4, 7}; + * \endcode + * + * \see http://www.sgi.com/tech/stl/transform.html + */ +template + OutputIterator transform(const thrust::detail::execution_policy_base &exec, + InputIterator1 first1, InputIterator1 last1, + InputIterator2 first2, + OutputIterator result, + BinaryFunction op); + + +/*! This version of \p transform applies a binary function to each pair + * of elements from two input sequences and stores the result in the + * corresponding position in an output sequence. Specifically, for + * each iterator i in the range [\p first1, \p last1) and + * j = first + (i - first1) in the range [\p first2, \p last2) + * the operation op(*i,*j) is performed and the result is + * assigned to *o, where o is the corresponding + * output iterator in the range [\p result, \p result + (\p last - \p first) ). + * The input and output sequences may coincide, resulting in an + * in-place transformation. + * + * \param first1 The beginning of the first input sequence. + * \param last1 The end of the first input sequence. + * \param first2 The beginning of the second input sequence. + * \param result The beginning of the output sequence. + * \param op The tranformation operation. + * \return The end of the output sequence. + * + * \tparam InputIterator1 is a model of Input Iterator + * and \c InputIterator1's \c value_type is convertible to \c BinaryFunction's \c first_argument_type. + * \tparam InputIterator2 is a model of Input Iterator + * and \c InputIterator2's \c value_type is convertible to \c BinaryFunction's \c second_argument_type. + * \tparam OutputIterator is a model of Output Iterator. + * \tparam BinaryFunction is a model of Binary Function + * and \c BinaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type. 
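+ *
+ * The following snippet is an illustrative sketch showing the same overload applied to
+ * Thrust containers rather than raw arrays; it assumes \p thrust::device_vector is
+ * available:
+ *
+ * \code
+ * #include <thrust/transform.h>
+ * #include <thrust/device_vector.h>
+ * #include <thrust/functional.h>
+ * ...
+ * thrust::device_vector<float> x(4, 1.0f);
+ * thrust::device_vector<float> y(4, 2.0f);
+ * thrust::device_vector<float> z(4);
+ *
+ * // z[i] = x[i] * y[i]
+ * thrust::transform(x.begin(), x.end(), y.begin(), z.begin(),
+ *                   thrust::multiplies<float>());
+ * \endcode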
+ * + * \pre \p first1 may equal \p result, but the range [first1, last1) shall not overlap the range [result, result + (last1 - first1)) otherwise. + * \pre \p first2 may equal \p result, but the range [first2, first2 + (last1 - first1)) shall not overlap the range [result, result + (last1 - first1)) otherwise. + * + * The following code snippet demonstrates how to use \p transform + * + * \code + * #include + * #include + * + * int input1[6] = {-5, 0, 2, 3, 2, 4}; + * int input2[6] = { 3, 6, -2, 1, 2, 3}; + * int output[6]; + * + * thrust::plus op; + * + * thrust::transform(input1, input1 + 6, input2, output, op); + * + * // output is now {-2, 6, 0, 4, 4, 7}; + * \endcode + * + * \see http://www.sgi.com/tech/stl/transform.html + */ +template + OutputIterator transform(InputIterator1 first1, InputIterator1 last1, + InputIterator2 first2, + OutputIterator result, + BinaryFunction op); + + +/*! This version of \p transform_if conditionally applies a unary function + * to each element of an input sequence and stores the result in the corresponding + * position in an output sequence if the corresponding position in the input sequence + * satifies a predicate. Otherwise, the corresponding position in the + * output sequence is not modified. + * + * Specifically, for each iterator i in the range [first, last) the + * predicate pred(*i) is evaluated. If this predicate + * evaluates to \c true, the result of op(*i) is assigned to *o, + * where o is the corresponding output iterator in the range + * [result, result + (last - first) ). Otherwise, op(*i) is + * not evaluated and no assignment occurs. The input and output sequences may coincide, + * resulting in an in-place transformation. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the input sequence. + * \param last The end of the input sequence. + * \param result The beginning of the output sequence. + * \param op The tranformation operation. + * \param pred The predicate operation. + * \return The end of the output sequence. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator is a model of Input Iterator, + * and \c InputIterator's \c value_type is convertible to \c Predicate's \c argument_type, + * and \c InputIterator's \c value_type is convertible to \c UnaryFunction's \c argument_type. + * \tparam ForwardIterator is a model of Forward Iterator. + * \tparam UnaryFunction is a model of Unary Function + * and \c UnaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type. + * \tparam Predicate is a model of Predicate. + * + * \pre \p first may equal \p result, but the range [first, last) shall not overlap the range [result, result + (last - first)) otherwise. + * + * The following code snippet demonstrates how to use \p transform_if to negate the odd-valued + * elements of a range using the \p thrust::host execution policy for parallelization: + * + * \code + * #include + * #include + * #include + * ... 
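+ *
+ * // note: in this overload the predicate is applied to the input elements themselves;
+ * // the stencil-based overloads further below use a separate stencil sequence instead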
+ * + * int data[10] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8}; + * + * struct is_odd + * { + * __host__ __device__ + * bool operator()(int x) + * { + * return x % 2; + * } + * }; + * + * thrust::negate op; + * thrust::identity identity; + * + * // negate odd elements + * thrust::transform_if(thrust::host, data, data + 10, data, op, is_odd()); // in-place transformation + * + * // data is now {5, 0, 2, 3, 2, 4, 0, 1, 2, 8}; + * \endcode + * + * \see thrust::transform + */ +template + ForwardIterator transform_if(const thrust::detail::execution_policy_base &exec, + InputIterator first, InputIterator last, + ForwardIterator result, + UnaryFunction op, + Predicate pred); + + +/*! This version of \p transform_if conditionally applies a unary function + * to each element of an input sequence and stores the result in the corresponding + * position in an output sequence if the corresponding position in the input sequence + * satifies a predicate. Otherwise, the corresponding position in the + * output sequence is not modified. + * + * Specifically, for each iterator i in the range [first, last) the + * predicate pred(*i) is evaluated. If this predicate + * evaluates to \c true, the result of op(*i) is assigned to *o, + * where o is the corresponding output iterator in the range + * [result, result + (last - first) ). Otherwise, op(*i) is + * not evaluated and no assignment occurs. The input and output sequences may coincide, + * resulting in an in-place transformation. + * + * \param first The beginning of the input sequence. + * \param last The end of the input sequence. + * \param result The beginning of the output sequence. + * \param op The tranformation operation. + * \param pred The predicate operation. + * \return The end of the output sequence. + * + * \tparam InputIterator is a model of Input Iterator, + * and \c InputIterator's \c value_type is convertible to \c Predicate's \c argument_type, + * and \c InputIterator's \c value_type is convertible to \c UnaryFunction's \c argument_type. + * \tparam ForwardIterator is a model of Forward Iterator. + * \tparam UnaryFunction is a model of Unary Function + * and \c UnaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type. + * \tparam Predicate is a model of Predicate. + * + * \pre \p first may equal \p result, but the range [first, last) shall not overlap the range [result, result + (last - first)) otherwise. + * + * The following code snippet demonstrates how to use \p transform_if: + * + * \code + * #include + * #include + * + * int data[10] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8}; + * + * struct is_odd + * { + * __host__ __device__ + * bool operator()(int x) + * { + * return x % 2; + * } + * }; + * + * thrust::negate op; + * thrust::identity identity; + * + * // negate odd elements + * thrust::transform_if(data, data + 10, data, op, is_odd()); // in-place transformation + * + * // data is now {5, 0, 2, 3, 2, 4, 0, 1, 2, 8}; + * \endcode + * + * \see thrust::transform + */ +template + ForwardIterator transform_if(InputIterator first, InputIterator last, + ForwardIterator result, + UnaryFunction op, + Predicate pred); + + +/*! This version of \p transform_if conditionally applies a unary function + * to each element of an input sequence and stores the result in the corresponding + * position in an output sequence if the corresponding position in a stencil sequence + * satisfies a predicate. Otherwise, the corresponding position in the + * output sequence is not modified. 
+ * + * Specifically, for each iterator i in the range [first, last) the + * predicate pred(*s) is evaluated, where s is the corresponding input + * iterator in the range [stencil, stencil + (last - first) ). If this predicate + * evaluates to \c true, the result of op(*i) is assigned to *o, + * where o is the corresponding output iterator in the range + * [result, result + (last - first) ). Otherwise, op(*i) is + * not evaluated and no assignment occurs. The input and output sequences may coincide, + * resulting in an in-place transformation. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the input sequence. + * \param last The end of the input sequence. + * \param stencil The beginning of the stencil sequence. + * \param result The beginning of the output sequence. + * \param op The tranformation operation. + * \param pred The predicate operation. + * \return The end of the output sequence. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator1 is a model of Input Iterator + * and \c InputIterator1's \c value_type is convertible to \c UnaryFunction's \c argument_type. + * \tparam InputIterator2 is a model of Input Iterator + * and \c InputIterator2's \c value_type is convertible to \c Predicate's \c argument_type. + * \tparam ForwardIterator is a model of Forward Iterator. + * \tparam UnaryFunction is a model of Unary Function + * and \c UnaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type. + * \tparam Predicate is a model of Predicate. + * + * \pre \p first may equal \p result, but the range [first, last) shall not overlap the range [result, result + (last - first)) otherwise. + * \pre \p stencil may equal \p result, but the range [stencil, stencil + (last - first)) shall not overlap the range [result, result + (last - first)) otherwise. + * + * The following code snippet demonstrates how to use \p transform_if using the \p thrust::host + * execution policy for parallelization: + * + * \code + * #include + * #include + * #include + * ... + * + * int data[10] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8}; + * int stencil[10] = { 1, 0, 1, 0, 1, 0, 1, 0, 1, 0}; + * + * thrust::negate op; + * thrust::identity identity; + * + * thrust::transform_if(thrust::host, data, data + 10, stencil, data, op, identity); // in-place transformation + * + * // data is now {5, 0, -2, -3, -2, 4, 0, -1, -2, 8}; + * \endcode + * + * \see thrust::transform + */ +template + ForwardIterator transform_if(const thrust::detail::execution_policy_base &exec, + InputIterator1 first, InputIterator1 last, + InputIterator2 stencil, + ForwardIterator result, + UnaryFunction op, + Predicate pred); + + +/*! This version of \p transform_if conditionally applies a unary function + * to each element of an input sequence and stores the result in the corresponding + * position in an output sequence if the corresponding position in a stencil sequence + * satisfies a predicate. Otherwise, the corresponding position in the + * output sequence is not modified. + * + * Specifically, for each iterator i in the range [first, last) the + * predicate pred(*s) is evaluated, where s is the corresponding input + * iterator in the range [stencil, stencil + (last - first) ). 
If this predicate + * evaluates to \c true, the result of op(*i) is assigned to *o, + * where o is the corresponding output iterator in the range + * [result, result + (last - first) ). Otherwise, op(*i) is + * not evaluated and no assignment occurs. The input and output sequences may coincide, + * resulting in an in-place transformation. + * + * \param first The beginning of the input sequence. + * \param last The end of the input sequence. + * \param stencil The beginning of the stencil sequence. + * \param result The beginning of the output sequence. + * \param op The tranformation operation. + * \param pred The predicate operation. + * \return The end of the output sequence. + * + * \tparam InputIterator1 is a model of Input Iterator + * and \c InputIterator1's \c value_type is convertible to \c UnaryFunction's \c argument_type. + * \tparam InputIterator2 is a model of Input Iterator + * and \c InputIterator2's \c value_type is convertible to \c Predicate's \c argument_type. + * \tparam ForwardIterator is a model of Forward Iterator. + * \tparam UnaryFunction is a model of Unary Function + * and \c UnaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type. + * \tparam Predicate is a model of Predicate. + * + * \pre \p first may equal \p result, but the range [first, last) shall not overlap the range [result, result + (last - first)) otherwise. + * \pre \p stencil may equal \p result, but the range [stencil, stencil + (last - first)) shall not overlap the range [result, result + (last - first)) otherwise. + * + * The following code snippet demonstrates how to use \p transform_if: + * + * \code + * #include + * #include + * + * int data[10] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8}; + * int stencil[10] = { 1, 0, 1, 0, 1, 0, 1, 0, 1, 0}; + * + * thrust::negate op; + * thrust::identity identity; + * + * thrust::transform_if(data, data + 10, stencil, data, op, identity); // in-place transformation + * + * // data is now {5, 0, -2, -3, -2, 4, 0, -1, -2, 8}; + * \endcode + * + * \see thrust::transform + */ +template + ForwardIterator transform_if(InputIterator1 first, InputIterator1 last, + InputIterator2 stencil, + ForwardIterator result, + UnaryFunction op, + Predicate pred); + + +/*! This version of \p transform_if conditionally applies a binary function + * to each pair of elements from two input sequences and stores the result in the corresponding + * position in an output sequence if the corresponding position in a stencil sequence + * satifies a predicate. Otherwise, the corresponding position in the + * output sequence is not modified. + * + * Specifically, for each iterator i in the range [first1, last1) and + * j = first2 + (i - first1) in the range [first2, first2 + (last1 - first1) ), + * the predicate pred(*s) is evaluated, where s is the corresponding input + * iterator in the range [stencil, stencil + (last1 - first1) ). If this predicate + * evaluates to \c true, the result of binary_op(*i,*j) is assigned to *o, + * where o is the corresponding output iterator in the range + * [result, result + (last1 - first1) ). Otherwise, binary_op(*i,*j) is + * not evaluated and no assignment occurs. The input and output sequences may coincide, + * resulting in an in-place transformation. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first1 The beginning of the first input sequence. + * \param last1 The end of the first input sequence. 
+ * \param first2 The beginning of the second input sequence. + * \param stencil The beginning of the stencil sequence. + * \param result The beginning of the output sequence. + * \param binary_op The transformation operation. + * \param pred The predicate operation. + * \return The end of the output sequence. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator1 is a model of Input Iterator + * and \c InputIterator1's \c value_type is convertible to \c BinaryFunction's \c first_argument_type. + * \tparam InputIterator2 is a model of Input Iterator + * and \c InputIterator2's \c value_type is convertible to \c BinaryFunction's \c second_argument_type. + * \tparam ForwardIterator is a model of Forward Iterator. + * \tparam BinaryFunction is a model of Binary Function + * and \c BinaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type. + * \tparam Predicate is a model of Predicate. + * + * \pre \p first1 may equal \p result, but the range [first1, last1) shall not overlap the range [result, result + (last1 - first1)) otherwise. + * \pre \p first2 may equal \p result, but the range [first2, first2 + (last1 - first1)) shall not overlap the range [result, result + (last1 - first1)) otherwise. + * \pre \p stencil may equal \p result, but the range [stencil, stencil + (last1 - first1)) shall not overlap the range [result, result + (last1 - first1)) otherwise. + * + * The following code snippet demonstrates how to use \p transform_if using the \p thrust::host + * execution policy for parallelization: + * + * \code + * #include + * #include + * #include + * ... + * + * int input1[6] = {-5, 0, 2, 3, 2, 4}; + * int input2[6] = { 3, 6, -2, 1, 2, 3}; + * int stencil[8] = { 1, 0, 1, 0, 1, 0}; + * int output[6]; + * + * thrust::plus op; + * thrust::identity identity; + * + * thrust::transform_if(thrust::host, input1, input1 + 6, input2, stencil, output, op, identity); + * + * // output is now {-2, 0, 0, 3, 4, 4}; + * \endcode + * + * \see thrust::transform + */ +template + ForwardIterator transform_if(const thrust::detail::execution_policy_base &exec, + InputIterator1 first1, InputIterator1 last1, + InputIterator2 first2, + InputIterator3 stencil, + ForwardIterator result, + BinaryFunction binary_op, + Predicate pred); + + +/*! This version of \p transform_if conditionally applies a binary function + * to each pair of elements from two input sequences and stores the result in the corresponding + * position in an output sequence if the corresponding position in a stencil sequence + * satifies a predicate. Otherwise, the corresponding position in the + * output sequence is not modified. + * + * Specifically, for each iterator i in the range [first1, last1) and + * j = first2 + (i - first1) in the range [first2, first2 + (last1 - first1) ), + * the predicate pred(*s) is evaluated, where s is the corresponding input + * iterator in the range [stencil, stencil + (last1 - first1) ). If this predicate + * evaluates to \c true, the result of binary_op(*i,*j) is assigned to *o, + * where o is the corresponding output iterator in the range + * [result, result + (last1 - first1) ). Otherwise, binary_op(*i,*j) is + * not evaluated and no assignment occurs. The input and output sequences may coincide, + * resulting in an in-place transformation. + * + * \param first1 The beginning of the first input sequence. + * \param last1 The end of the first input sequence. + * \param first2 The beginning of the second input sequence. 
+ * \param stencil The beginning of the stencil sequence. + * \param result The beginning of the output sequence. + * \param binary_op The transformation operation. + * \param pred The predicate operation. + * \return The end of the output sequence. + * + * \tparam InputIterator1 is a model of Input Iterator + * and \c InputIterator1's \c value_type is convertible to \c BinaryFunction's \c first_argument_type. + * \tparam InputIterator2 is a model of Input Iterator + * and \c InputIterator2's \c value_type is convertible to \c BinaryFunction's \c second_argument_type. + * \tparam ForwardIterator is a model of Forward Iterator. + * \tparam BinaryFunction is a model of Binary Function + * and \c BinaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type. + * \tparam Predicate is a model of Predicate. + * + * \pre \p first1 may equal \p result, but the range [first1, last1) shall not overlap the range [result, result + (last1 - first1)) otherwise. + * \pre \p first2 may equal \p result, but the range [first2, first2 + (last1 - first1)) shall not overlap the range [result, result + (last1 - first1)) otherwise. + * \pre \p stencil may equal \p result, but the range [stencil, stencil + (last1 - first1)) shall not overlap the range [result, result + (last1 - first1)) otherwise. + * + * The following code snippet demonstrates how to use \p transform_if: + * + * \code + * #include + * #include + * + * int input1[6] = {-5, 0, 2, 3, 2, 4}; + * int input2[6] = { 3, 6, -2, 1, 2, 3}; + * int stencil[8] = { 1, 0, 1, 0, 1, 0}; + * int output[6]; + * + * thrust::plus op; + * thrust::identity identity; + * + * thrust::transform_if(input1, input1 + 6, input2, stencil, output, op, identity); + * + * // output is now {-2, 0, 0, 3, 4, 4}; + * \endcode + * + * \see thrust::transform + */ +template + ForwardIterator transform_if(InputIterator1 first1, InputIterator1 last1, + InputIterator2 first2, + InputIterator3 stencil, + ForwardIterator result, + BinaryFunction binary_op, + Predicate pred); + + +/*! \} // end transformations + */ + + +} // end namespace thrust + +#include + diff --git a/compat/thrust/transform_reduce.h b/compat/thrust/transform_reduce.h new file mode 100644 index 0000000..3ef5efd --- /dev/null +++ b/compat/thrust/transform_reduce.h @@ -0,0 +1,197 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file transform_reduce.h + * \brief Fused transform / reduction + */ + +#pragma once + +#include +#include + +namespace thrust +{ + + +/*! \addtogroup reductions + * \{ + * \addtogroup transformed_reductions Transformed Reductions + * \ingroup reductions + * \{ + */ + + +/*! \p transform_reduce fuses the \p transform and \p reduce operations. + * \p transform_reduce is equivalent to performing a transformation defined by + * \p unary_op into a temporary sequence and then performing \p reduce on the + * transformed sequence. 
In most cases, fusing these two operations together is + * more efficient, since fewer memory reads and writes are required. + * + * \p transform_reduce performs a reduction on the transformation of the + * sequence [first, last) according to \p unary_op. Specifically, + * \p unary_op is applied to each element of the sequence and then the result + * is reduced to a single value with \p binary_op using the initial value + * \p init. Note that the transformation \p unary_op is not applied to + * the initial value \p init. The order of reduction is not specified, + * so \p binary_op must be both commutative and associative. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the sequence. + * \param last The end of the sequence. + * \param unary_op The function to apply to each element of the input sequence. + * \param init The result is initialized to this value. + * \param binary_op The reduction operation. + * \return The result of the transformed reduction. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator is a model of Input Iterator, + * and \p InputIterator's \c value_type is convertible to \p UnaryFunction's \c argument_type. + * \tparam UnaryFunction is a model of Unary Function, + * and \p UnaryFunction's \c result_type is convertible to \c OutputType. + * \tparam OutputType is a model of Assignable, + * and is convertible to \p BinaryFunction's \c first_argument_type and \c second_argument_type. + * \tparam BinaryFunction is a model of Binary Function, + * and \p BinaryFunction's \c result_type is convertible to \p OutputType. + * + * The following code snippet demonstrates how to use \p transform_reduce + * to compute the maximum value of the absolute value of the elements + * of a range using the \p thrust::host execution policy for parallelization: + * + * \code + * #include + * #include + * #include + * + * template + * struct absolute_value : public unary_function + * { + * __host__ __device__ T operator()(const T &x) const + * { + * return x < T(0) ? -x : x; + * } + * }; + * + * ... + * + * int data[6] = {-1, 0, -2, -2, 1, -3}; + * int result = thrust::transform_reduce(thrust::host, + * data, data + 6, + * absolute_value(), + * 0, + * thrust::maximum()); + * // result == 3 + * \endcode + * + * \see \c transform + * \see \c reduce + */ +template + OutputType transform_reduce(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + UnaryFunction unary_op, + OutputType init, + BinaryFunction binary_op); + + +/*! \p transform_reduce fuses the \p transform and \p reduce operations. + * \p transform_reduce is equivalent to performing a transformation defined by + * \p unary_op into a temporary sequence and then performing \p reduce on the + * transformed sequence. In most cases, fusing these two operations together is + * more efficient, since fewer memory reads and writes are required. + * + * \p transform_reduce performs a reduction on the transformation of the + * sequence [first, last) according to \p unary_op. Specifically, + * \p unary_op is applied to each element of the sequence and then the result + * is reduced to a single value with \p binary_op using the initial value + * \p init. Note that the transformation \p unary_op is not applied to + * the initial value \p init. 
The order of reduction is not specified, + * so \p binary_op must be both commutative and associative. + * + * \param first The beginning of the sequence. + * \param last The end of the sequence. + * \param unary_op The function to apply to each element of the input sequence. + * \param init The result is initialized to this value. + * \param binary_op The reduction operation. + * \return The result of the transformed reduction. + * + * \tparam InputIterator is a model of Input Iterator, + * and \p InputIterator's \c value_type is convertible to \p UnaryFunction's \c argument_type. + * \tparam UnaryFunction is a model of Unary Function, + * and \p UnaryFunction's \c result_type is convertible to \c OutputType. + * \tparam OutputType is a model of Assignable, + * and is convertible to \p BinaryFunction's \c first_argument_type and \c second_argument_type. + * \tparam BinaryFunction is a model of Binary Function, + * and \p BinaryFunction's \c result_type is convertible to \p OutputType. + * + * The following code snippet demonstrates how to use \p transform_reduce + * to compute the maximum value of the absolute value of the elements + * of a range. + * + * \code + * #include + * #include + * + * template + * struct absolute_value : public unary_function + * { + * __host__ __device__ T operator()(const T &x) const + * { + * return x < T(0) ? -x : x; + * } + * }; + * + * ... + * + * int data[6] = {-1, 0, -2, -2, 1, -3}; + * int result = thrust::transform_reduce(data, data + 6, + * absolute_value(), + * 0, + * thrust::maximum()); + * // result == 3 + * \endcode + * + * \see \c transform + * \see \c reduce + */ +template + OutputType transform_reduce(InputIterator first, + InputIterator last, + UnaryFunction unary_op, + OutputType init, + BinaryFunction binary_op); + + +/*! \} // end transformed_reductions + * \} // end reductions + */ + + +} // end namespace thrust + +#include + diff --git a/compat/thrust/transform_scan.h b/compat/thrust/transform_scan.h new file mode 100644 index 0000000..e9943e4 --- /dev/null +++ b/compat/thrust/transform_scan.h @@ -0,0 +1,322 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file transform_scan.h + * \brief Fused transform / prefix-sum + */ + +#pragma once + +#include +#include + +namespace thrust +{ + + +/*! \addtogroup algorithms + */ + +/*! \addtogroup prefixsums Prefix Sums + * \ingroup algorithms + * \{ + */ + +/*! \addtogroup transformed_prefixsums Transformed Prefix Sums + * \ingroup prefixsums + * \{ + */ + + +/*! \p transform_inclusive_scan fuses the \p transform and \p inclusive_scan + * operations. \p transform_inclusive_scan is equivalent to performing a + * tranformation defined by \p unary_op into a temporary sequence and then + * performing an \p inclusive_scan on the tranformed sequence. In most + * cases, fusing these two operations together is more efficient, since + * fewer memory reads and writes are required. 
In \p transform_inclusive_scan, + * unary_op(\*first) is assigned to \*result and the result + * of binary_op(unary_op(\*first), unary_op(\*(first + 1))) is + * assigned to \*(result + 1), and so on. The transform scan + * operation is permitted to be in-place. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the input sequence. + * \param last The end of the input sequence. + * \param result The beginning of the output sequence. + * \param unary_op The function used to tranform the input sequence. + * \param binary_op The associatve operator used to 'sum' transformed values. + * \return The end of the output sequence. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator is a model of Input Iterator + * and \c InputIterator's \c value_type is convertible to \c unary_op's input type. + * \tparam OutputIterator is a model of Output Iterator. + * \tparam UnaryFunction is a model of Unary Function + * and accepts inputs of \c InputIterator's \c value_type. \c UnaryFunction's result_type + * is convertable to \c OutputIterator's \c value_type. + * \tparam AssociativeOperator is a model of Binary Function + * and \c AssociativeOperator's \c result_type is + * convertible to \c OutputIterator's \c value_type. + * + * \pre \p first may equal \p result, but the range [first, last) and the range [result, result + (last - first)) shall not overlap otherwise. + * + * The following code snippet demonstrates how to use \p transform_inclusive_scan using the + * \p thrust::host execution policy for parallelization: + * + * \code + * #include + * #include + * ... + * + * int data[6] = {1, 0, 2, 2, 1, 3}; + * + * thrust::negate unary_op; + * thrust::plus binary_op; + * + * thrust::transform_inclusive_scan(thrust::host, data, data + 6, data, unary_op, binary_op); // in-place scan + * + * // data is now {-1, -1, -3, -5, -6, -9} + * \endcode + * + * \see \p transform + * \see \p inclusive_scan + * + */ +template + OutputIterator transform_inclusive_scan(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + UnaryFunction unary_op, + AssociativeOperator binary_op); + + +/*! \p transform_inclusive_scan fuses the \p transform and \p inclusive_scan + * operations. \p transform_inclusive_scan is equivalent to performing a + * tranformation defined by \p unary_op into a temporary sequence and then + * performing an \p inclusive_scan on the tranformed sequence. In most + * cases, fusing these two operations together is more efficient, since + * fewer memory reads and writes are required. In \p transform_inclusive_scan, + * unary_op(\*first) is assigned to \*result and the result + * of binary_op(unary_op(\*first), unary_op(\*(first + 1))) is + * assigned to \*(result + 1), and so on. The transform scan + * operation is permitted to be in-place. + * + * \param first The beginning of the input sequence. + * \param last The end of the input sequence. + * \param result The beginning of the output sequence. + * \param unary_op The function used to tranform the input sequence. + * \param binary_op The associatve operator used to 'sum' transformed values. + * \return The end of the output sequence. + * + * \tparam InputIterator is a model of Input Iterator + * and \c InputIterator's \c value_type is convertible to \c unary_op's input type. 
+ * \tparam OutputIterator is a model of Output Iterator. + * \tparam UnaryFunction is a model of Unary Function + * and accepts inputs of \c InputIterator's \c value_type. \c UnaryFunction's result_type + * is convertable to \c OutputIterator's \c value_type. + * \tparam AssociativeOperator is a model of Binary Function + * and \c AssociativeOperator's \c result_type is + * convertible to \c OutputIterator's \c value_type. + * + * \pre \p first may equal \p result, but the range [first, last) and the range [result, result + (last - first)) shall not overlap otherwise. + * + * The following code snippet demonstrates how to use \p transform_inclusive_scan + * + * \code + * #include + * + * int data[6] = {1, 0, 2, 2, 1, 3}; + * + * thrust::negate unary_op; + * thrust::plus binary_op; + * + * thrust::transform_inclusive_scan(data, data + 6, data, unary_op, binary_op); // in-place scan + * + * // data is now {-1, -1, -3, -5, -6, -9} + * \endcode + * + * \see \p transform + * \see \p inclusive_scan + * + */ +template + OutputIterator transform_inclusive_scan(InputIterator first, + InputIterator last, + OutputIterator result, + UnaryFunction unary_op, + AssociativeOperator binary_op); + + +/*! \p transform_exclusive_scan fuses the \p transform and \p exclusive_scan + * operations. \p transform_exclusive_scan is equivalent to performing a + * tranformation defined by \p unary_op into a temporary sequence and then + * performing an \p exclusive_scan on the tranformed sequence. In most + * cases, fusing these two operations together is more efficient, since + * fewer memory reads and writes are required. In + * \p transform_exclusive_scan, \p init is assigned to \*result + * and the result of binary_op(init, unary_op(\*first)) is assigned + * to \*(result + 1), and so on. The transform scan operation is + * permitted to be in-place. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the input sequence. + * \param last The end of the input sequence. + * \param result The beginning of the output sequence. + * \param unary_op The function used to tranform the input sequence. + * \param init The initial value of the \p exclusive_scan + * \param binary_op The associatve operator used to 'sum' transformed values. + * \return The end of the output sequence. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator is a model of Input Iterator + * and \c InputIterator's \c value_type is convertible to \c unary_op's input type. + * \tparam OutputIterator is a model of Output Iterator. + * \tparam UnaryFunction is a model of Unary Function + * and accepts inputs of \c InputIterator's \c value_type. \c UnaryFunction's result_type + * is convertable to \c OutputIterator's \c value_type. + * \tparam T is convertible to \c OutputIterator's \c value_type. + * \tparam AssociativeOperator is a model of Binary Function + * and \c AssociativeOperator's \c result_type is + * convertible to \c OutputIterator's \c value_type. + * + * \pre \p first may equal \p result, but the range [first, last) and the range [result, result + (last - first)) shall not overlap otherwise. + * + * The following code snippet demonstrates how to use \p transform_exclusive_scan using the + * \p thrust::host execution policy for parallelization: + * + * \code + * #include + * #include + * ... 
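+ *
+ * // unary_op negates each element; the scan then forms exclusive prefix sums of the
+ * // negated values, seeded with the initial value 4 supplied in the call below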
+ * + * int data[6] = {1, 0, 2, 2, 1, 3}; + * + * thrust::negate unary_op; + * thrust::plus binary_op; + * + * thrust::transform_exclusive_scan(thrust::host, data, data + 6, data, unary_op, 4, binary_op); // in-place scan + * + * // data is now {4, 3, 3, 1, -1, -2} + * \endcode + * + * \see \p transform + * \see \p exclusive_scan + * + */ +template + OutputIterator transform_exclusive_scan(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + UnaryFunction unary_op, + T init, + AssociativeOperator binary_op); + + +/*! \p transform_exclusive_scan fuses the \p transform and \p exclusive_scan + * operations. \p transform_exclusive_scan is equivalent to performing a + * tranformation defined by \p unary_op into a temporary sequence and then + * performing an \p exclusive_scan on the tranformed sequence. In most + * cases, fusing these two operations together is more efficient, since + * fewer memory reads and writes are required. In + * \p transform_exclusive_scan, \p init is assigned to \*result + * and the result of binary_op(init, unary_op(\*first)) is assigned + * to \*(result + 1), and so on. The transform scan operation is + * permitted to be in-place. + * + * \param first The beginning of the input sequence. + * \param last The end of the input sequence. + * \param result The beginning of the output sequence. + * \param unary_op The function used to tranform the input sequence. + * \param init The initial value of the \p exclusive_scan + * \param binary_op The associatve operator used to 'sum' transformed values. + * \return The end of the output sequence. + * + * \tparam InputIterator is a model of Input Iterator + * and \c InputIterator's \c value_type is convertible to \c unary_op's input type. + * \tparam OutputIterator is a model of Output Iterator. + * \tparam UnaryFunction is a model of Unary Function + * and accepts inputs of \c InputIterator's \c value_type. \c UnaryFunction's result_type + * is convertable to \c OutputIterator's \c value_type. + * \tparam T is convertible to \c OutputIterator's \c value_type. + * \tparam AssociativeOperator is a model of Binary Function + * and \c AssociativeOperator's \c result_type is + * convertible to \c OutputIterator's \c value_type. + * + * \pre \p first may equal \p result, but the range [first, last) and the range [result, result + (last - first)) shall not overlap otherwise. + * + * The following code snippet demonstrates how to use \p transform_exclusive_scan + * + * \code + * #include + * + * int data[6] = {1, 0, 2, 2, 1, 3}; + * + * thrust::negate unary_op; + * thrust::plus binary_op; + * + * thrust::transform_exclusive_scan(data, data + 6, data, unary_op, 4, binary_op); // in-place scan + * + * // data is now {4, 3, 3, 1, -1, -2} + * \endcode + * + * \see \p transform + * \see \p exclusive_scan + * + */ +template + OutputIterator transform_exclusive_scan(InputIterator first, + InputIterator last, + OutputIterator result, + UnaryFunction unary_op, + T init, + AssociativeOperator binary_op); + + +/*! \} // end transformed_prefixsums + */ + + +/*! \} // end prefixsums + */ + + +} // end namespace thrust + +#include + diff --git a/compat/thrust/tuple.h b/compat/thrust/tuple.h new file mode 100644 index 0000000..3961d98 --- /dev/null +++ b/compat/thrust/tuple.h @@ -0,0 +1,583 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file tuple.h + * \brief A type encapsulating a heterogeneous collection of elements + */ + +/* + * Copyright (C) 1999, 2000 Jaakko Järvi (jaakko.jarvi@cs.utu.fi) + * + * Distributed under the Boost Software License, Version 1.0. + * (See accompanying NOTICE file for the complete license) + * + * For more information, see http://www.boost.org + */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ + +/*! \addtogroup utility + * \{ + */ + +/*! \addtogroup tuple + * \{ + */ + +/*! \cond + */ + +struct null_type; + +/*! \endcond + */ + +/*! This metafunction returns the type of a + * \p tuple's Nth element. + * + * \tparam N This parameter selects the element of interest. + * \tparam T A \c tuple type of interest. + * + * \see pair + * \see tuple + */ +template + struct tuple_element +{ + private: + typedef typename T::tail_type Next; + + public: + /*! The result of this metafunction is returned in \c type. + */ + typedef typename tuple_element::type type; +}; // end tuple_element + +/*! This metafunction returns the number of elements + * of a \p tuple type of interest. + * + * \tparam T A \c tuple type of interest. + * + * \see pair + * \see tuple + */ +template + struct tuple_size +{ + /*! The result of this metafunction is returned in \c value. + */ + static const int value = 1 + tuple_size::value; +}; // end tuple_size + +// get function for non-const cons-lists, returns a reference to the element + +/*! The \p get function returns a reference to a \p tuple element of + * interest. + * + * \param t A reference to a \p tuple of interest. + * \return A reference to \p t's Nth element. + * + * \tparam N The index of the element of interest. + * + * The following code snippet demonstrates how to use \p get to print + * the value of a \p tuple element. + * + * \code + * #include + * #include + * ... + * thrust::tuple t(13, "thrust"); + * + * std::cout << "The 1st value of t is " << thrust::get<1>(t) << std::endl; + * \endcode + * + * \see pair + * \see tuple + */ +template +__host__ __device__ +inline typename access_traits< + typename tuple_element >::type + >::non_const_type +get(detail::cons& t); + + +/*! The \p get function returns a \c const reference to a \p tuple element of + * interest. + * + * \param t A reference to a \p tuple of interest. + * \return A \c const reference to \p t's Nth element. + * + * \tparam N The index of the element of interest. + * + * The following code snippet demonstrates how to use \p get to print + * the value of a \p tuple element. + * + * \code + * #include + * #include + * ... + * thrust::tuple t(13, "thrust"); + * + * std::cout << "The 1st value of t is " << thrust::get<1>(t) << std::endl; + * \endcode + * + * \see pair + * \see tuple + */ +template +__host__ __device__ +inline typename access_traits< + typename tuple_element >::type + >::const_type +get(const detail::cons& t); + + + +/*! \p tuple is a class template that can be instantiated with up to ten arguments. + * Each template argument specifies the type of element in the \p tuple. 
+ * Consequently, tuples are heterogeneous, fixed-size collections of values. An + * instantiation of \p tuple with two arguments is similar to an instantiation + * of \p pair with the same two arguments. Individual elements of a \p tuple may + * be accessed with the \p get function. + * + * \tparam TN The type of the N \c tuple element. Thrust's \p tuple + * type currently supports up to ten elements. + * + * The following code snippet demonstrates how to create a new \p tuple object + * and inspect and modify the value of its elements. + * + * \code + * #include + * #include + * ... + * // create a tuple containing an int, a float, and a string + * thrust::tuple t(13, 0.1f, "thrust"); + * + * // individual members are accessed with the free function get + * std::cout << "The first element's value is " << thrust::get<0>(t) << std::endl; + * + * // or the member function get + * std::cout << "The second element's value is " << t.get<1>() << std::endl; + * + * // we can also modify elements with the same function + * thrust::get<0>(t) += 10; + * \endcode + * + * \see pair + * \see get + * \see make_tuple + * \see tuple_element + * \see tuple_size + * \see tie + */ +template + class tuple : + public detail::map_tuple_to_cons::type +{ + /*! \cond + */ + + private: + typedef typename detail::map_tuple_to_cons::type inherited; + + /*! \endcond + */ + + public: + /*! \p tuple's no-argument constructor initializes each element. + */ + inline __host__ __device__ + tuple(void) {} + + /*! \p tuple's one-argument constructor copy constructs the first element from the given parameter + * and intializes all other elements. + * \param t0 The value to assign to this \p tuple's first element. + */ + inline __host__ __device__ + tuple(typename access_traits::parameter_type t0) + : inherited(t0, + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type())) {} + + /*! \p tuple's one-argument constructor copy constructs the first two elements from the given parameters + * and intializes all other elements. + * \param t0 The value to assign to this \p tuple's first element. + * \param t1 The value to assign to this \p tuple's second element. + * \note \p tuple's constructor has ten variants of this form, the rest of which are ommitted here for brevity. + */ + inline __host__ __device__ + tuple(typename access_traits::parameter_type t0, + typename access_traits::parameter_type t1) + : inherited(t0, t1, + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type())) {} + + /*! 
\cond + */ + + inline __host__ __device__ + tuple(typename access_traits::parameter_type t0, + typename access_traits::parameter_type t1, + typename access_traits::parameter_type t2) + : inherited(t0, t1, t2, + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type())) {} + + inline __host__ __device__ + tuple(typename access_traits::parameter_type t0, + typename access_traits::parameter_type t1, + typename access_traits::parameter_type t2, + typename access_traits::parameter_type t3) + : inherited(t0, t1, t2, t3, + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type())) {} + + inline __host__ __device__ + tuple(typename access_traits::parameter_type t0, + typename access_traits::parameter_type t1, + typename access_traits::parameter_type t2, + typename access_traits::parameter_type t3, + typename access_traits::parameter_type t4) + : inherited(t0, t1, t2, t3, t4, + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type())) {} + + inline __host__ __device__ + tuple(typename access_traits::parameter_type t0, + typename access_traits::parameter_type t1, + typename access_traits::parameter_type t2, + typename access_traits::parameter_type t3, + typename access_traits::parameter_type t4, + typename access_traits::parameter_type t5) + : inherited(t0, t1, t2, t3, t4, t5, + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type())) {} + + inline __host__ __device__ + tuple(typename access_traits::parameter_type t0, + typename access_traits::parameter_type t1, + typename access_traits::parameter_type t2, + typename access_traits::parameter_type t3, + typename access_traits::parameter_type t4, + typename access_traits::parameter_type t5, + typename access_traits::parameter_type t6) + : inherited(t0, t1, t2, t3, t4, t5, t6, + static_cast(null_type()), + static_cast(null_type()), + static_cast(null_type())) {} + + inline __host__ __device__ + tuple(typename access_traits::parameter_type t0, + typename access_traits::parameter_type t1, + typename access_traits::parameter_type t2, + typename access_traits::parameter_type t3, + typename access_traits::parameter_type t4, + typename access_traits::parameter_type t5, + typename access_traits::parameter_type t6, + typename access_traits::parameter_type t7) + : inherited(t0, t1, t2, t3, t4, t5, t6, t7, + static_cast(null_type()), + static_cast(null_type())) {} + + inline __host__ __device__ + tuple(typename access_traits::parameter_type t0, + typename access_traits::parameter_type t1, + typename access_traits::parameter_type t2, + typename access_traits::parameter_type t3, + typename access_traits::parameter_type t4, + typename access_traits::parameter_type t5, + typename access_traits::parameter_type t6, + typename access_traits::parameter_type t7, + typename access_traits::parameter_type t8) + : inherited(t0, t1, t2, t3, t4, t5, t6, t7, t8, + static_cast(null_type())) {} + + inline __host__ __device__ + tuple(typename access_traits::parameter_type t0, + typename access_traits::parameter_type t1, + typename access_traits::parameter_type t2, + typename access_traits::parameter_type t3, + typename access_traits::parameter_type t4, + typename access_traits::parameter_type t5, + typename 
access_traits::parameter_type t6, + typename access_traits::parameter_type t7, + typename access_traits::parameter_type t8, + typename access_traits::parameter_type t9) + : inherited(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9) {} + + + template + inline __host__ __device__ + tuple(const detail::cons& p) : inherited(p) {} + + template + inline __host__ __device__ + tuple& operator=(const detail::cons& k) + { + inherited::operator=(k); + return *this; + } + + /*! \endcond + */ + + /*! This assignment operator allows assigning the first two elements of this \p tuple from a \p pair. + * \param k A \p pair to assign from. + */ + template + __host__ __device__ inline + tuple& operator=(const thrust::pair& k) { + //BOOST_STATIC_ASSERT(length::value == 2);// check_length = 2 + this->head = k.first; + this->tail.head = k.second; + return *this; + } + + /*! \p swap swaps the elements of two tuples. + * + * \param t The other tuple with which to swap. + */ + inline __host__ __device__ + void swap(tuple &t) + { + inherited::swap(t); + } +}; + +/*! \cond + */ + +template <> +class tuple : + public null_type +{ +public: + typedef null_type inherited; +}; + +/*! \endcond + */ + + +/*! This version of \p make_tuple creates a new \c tuple object from a + * single object. + * + * \param t0 The object to copy from. + * \return A \p tuple object with a single member which is a copy of \p t0. + */ +template +__host__ __device__ inline + typename detail::make_tuple_mapper::type + make_tuple(const T0& t0); + +/*! This version of \p make_tuple creates a new \c tuple object from two + * objects. + * + * \param t0 The first object to copy from. + * \param t1 The second object to copy from. + * \return A \p tuple object with two members which are copies of \p t0 + * and \p t1. + * + * \note \p make_tuple has ten variants, the rest of which are omitted here + * for brevity. + */ +template +__host__ __device__ inline + typename detail::make_tuple_mapper::type + make_tuple(const T0& t0, const T1& t1); + +/*! This version of \p tie creates a new \c tuple whose single element is + * a reference which refers to this function's argument. + * + * \param t0 The object to reference. + * \return A \p tuple object with one member which is a reference to \p t0. + */ +template +__host__ __device__ inline +tuple tie(T0& t0); + +/*! This version of \p tie creates a new \c tuple of references object which + * refers to this function's arguments. + * + * \param t0 The first object to reference. + * \param t1 The second object to reference. + * \return A \p tuple object with two members which are references to \p t0 + * and \p t1. + * + * \note \p tie has ten variants, the rest of which are omitted here for + * brevity. + */ +template +__host__ __device__ inline +tuple tie(T0& t0, T1& t1); + +/*! \p swap swaps the contents of two tuples. + * + * \param x The first \p tuple to swap. + * \param y The second \p tuple to swap. + */ +template< + typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8, typename T9, + typename U0, typename U1, typename U2, typename U3, typename U4, typename U5, typename U6, typename U7, typename U8, typename U9 +> +inline __host__ __device__ +void swap(tuple &x, + tuple &y); + + + +/*! 
\cond + */ + +template +__host__ __device__ inline + typename detail::make_tuple_mapper::type + make_tuple(const T0& t0, const T1& t1, const T2& t2); + +template +__host__ __device__ inline + typename detail::make_tuple_mapper::type + make_tuple(const T0& t0, const T1& t1, const T2& t2, const T3& t3); + +template +__host__ __device__ inline + typename detail::make_tuple_mapper::type + make_tuple(const T0& t0, const T1& t1, const T2& t2, const T3& t3, const T4& t4); + +template +__host__ __device__ inline + typename detail::make_tuple_mapper::type + make_tuple(const T0& t0, const T1& t1, const T2& t2, const T3& t3, const T4& t4, const T5& t5); + +template +__host__ __device__ inline + typename detail::make_tuple_mapper::type + make_tuple(const T0& t0, const T1& t1, const T2& t2, const T3& t3, const T4& t4, const T5& t5, const T6& t6); + +template +__host__ __device__ inline + typename detail::make_tuple_mapper::type + make_tuple(const T0& t0, const T1& t1, const T2& t2, const T3& t3, const T4& t4, const T5& t5, const T6& t6, const T7& t7); + +template +__host__ __device__ inline + typename detail::make_tuple_mapper::type + make_tuple(const T0& t0, const T1& t1, const T2& t2, const T3& t3, const T4& t4, const T5& t5, const T6& t6, const T7& t7, const T8& t8); + +template +__host__ __device__ inline + typename detail::make_tuple_mapper::type + make_tuple(const T0& t0, const T1& t1, const T2& t2, const T3& t3, const T4& t4, const T5& t5, const T6& t6, const T7& t7, const T8& t8, const T9& t9); + +template +__host__ __device__ inline +tuple tie(T0 &t0, T1 &t1, T2 &t2); + +template +__host__ __device__ inline +tuple tie(T0 &t0, T1 &t1, T2 &t2, T3 &t3); + +template +__host__ __device__ inline +tuple tie(T0 &t0, T1 &t1, T2 &t2, T3 &t3, T4 &t4); + +template +__host__ __device__ inline +tuple tie(T0 &t0, T1 &t1, T2 &t2, T3 &t3, T4 &t4, T5 &t5); + +template +__host__ __device__ inline +tuple tie(T0 &t0, T1 &t1, T2 &t2, T3 &t3, T4 &t4, T5 &t5, T6 &t6); + +template +__host__ __device__ inline +tuple tie(T0 &t0, T1 &t1, T2 &t2, T3 &t3, T4 &t4, T5 &t5, T6 &t6, T7 &t7); + +template +__host__ __device__ inline +tuple tie(T0 &t0, T1 &t1, T2 &t2, T3 &t3, T4 &t4, T5 &t5, T6 &t6, T7 &t7, T8 &t8); + +template +__host__ __device__ inline +tuple tie(T0 &t0, T1 &t1, T2 &t2, T3 &t3, T4 &t4, T5 &t5, T6 &t6, T7 &t7, T8 &t8, T9 &t9); + + +__host__ __device__ inline +bool operator==(const null_type&, const null_type&); + +__host__ __device__ inline +bool operator>=(const null_type&, const null_type&); + +__host__ __device__ inline +bool operator<=(const null_type&, const null_type&); + +__host__ __device__ inline +bool operator!=(const null_type&, const null_type&); + +__host__ __device__ inline +bool operator<(const null_type&, const null_type&); + +__host__ __device__ inline +bool operator>(const null_type&, const null_type&); + +/*! \endcond + */ + +/*! \} // tuple + */ + +/*! \} // utility + */ + +} // end thrust + diff --git a/compat/thrust/uninitialized_copy.h b/compat/thrust/uninitialized_copy.h new file mode 100644 index 0000000..77b673c --- /dev/null +++ b/compat/thrust/uninitialized_copy.h @@ -0,0 +1,301 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file uninitialized_copy.h + * \brief Copy construction into a range of uninitialized elements from a source range + */ + +#pragma once + +#include +#include + +namespace thrust +{ + + +/*! \addtogroup copying + * \{ + */ + + +/*! In \c thrust, the function \c thrust::device_new allocates memory for + * an object and then creates an object at that location by calling a constructor. + * Occasionally, however, it is useful to separate those two operations. + * If each iterator in the range [result, result + (last - first)) points + * to uninitialized memory, then \p uninitialized_copy creates a copy of + * [first, last) in that range. That is, for each iterator \c i in + * the input, \p uninitialized_copy creates a copy of \c *i in the location pointed + * to by the corresponding iterator in the output range by \p ForwardIterator's + * \c value_type's copy constructor with *i as its argument. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The first element of the input range to copy from. + * \param last The last element of the input range to copy from. + * \param result The first element of the output range to copy to. + * \return An iterator pointing to the last element of the output range. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator is a model of Input Iterator. + * \tparam ForwardIterator is a model of Forward Iterator, + * \p ForwardIterator is mutable, and \p ForwardIterator's \c value_type has a constructor that takes + * a single argument whose type is \p InputIterator's \c value_type. + * + * \pre \p first may equal \p result, but the range [first, last) and the range [result, result + (last - first)) shall not overlap otherwise. + * + * The following code snippet demonstrates how to use \p uninitialized_copy to initialize + * a range of uninitialized memory using the \p thrust::device execution policy for + * parallelization: + * + * \code + * #include + * #include + * #include + * #include + * + * struct Int + * { + * __host__ __device__ + * Int(int x) : val(x) {} + * int val; + * }; + * ... + * const int N = 137; + * + * Int val(46); + * thrust::device_vector input(N, val); + * thrust::device_ptr array = thrust::device_malloc(N); + * thrust::uninitialized_copy(thrust::device, input.begin(), input.end(), array); + * + * // Int x = array[i]; + * // x.val == 46 for all 0 <= i < N + * \endcode + * + * \see http://www.sgi.com/tech/stl/uninitialized_copy.html + * \see \c copy + * \see \c uninitialized_fill + * \see \c device_new + * \see \c device_malloc + */ +template + ForwardIterator uninitialized_copy(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + ForwardIterator result); + + +/*! In \c thrust, the function \c thrust::device_new allocates memory for + * an object and then creates an object at that location by calling a constructor. + * Occasionally, however, it is useful to separate those two operations. 
+ * If each iterator in the range [result, result + (last - first)) points + * to uninitialized memory, then \p uninitialized_copy creates a copy of + * [first, last) in that range. That is, for each iterator \c i in + * the input, \p uninitialized_copy creates a copy of \c *i in the location pointed + * to by the corresponding iterator in the output range by \p ForwardIterator's + * \c value_type's copy constructor with *i as its argument. + * + * \param first The first element of the input range to copy from. + * \param last The last element of the input range to copy from. + * \param result The first element of the output range to copy to. + * \return An iterator pointing to the last element of the output range. + * + * \tparam InputIterator is a model of Input Iterator. + * \tparam ForwardIterator is a model of Forward Iterator, + * \p ForwardIterator is mutable, and \p ForwardIterator's \c value_type has a constructor that takes + * a single argument whose type is \p InputIterator's \c value_type. + * + * \pre \p first may equal \p result, but the range [first, last) and the range [result, result + (last - first)) shall not overlap otherwise. + * + * The following code snippet demonstrates how to use \p uninitialized_copy to initialize + * a range of uninitialized memory. + * + * \code + * #include + * #include + * #include + * + * struct Int + * { + * __host__ __device__ + * Int(int x) : val(x) {} + * int val; + * }; + * ... + * const int N = 137; + * + * Int val(46); + * thrust::device_vector input(N, val); + * thrust::device_ptr array = thrust::device_malloc(N); + * thrust::uninitialized_copy(input.begin(), input.end(), array); + * + * // Int x = array[i]; + * // x.val == 46 for all 0 <= i < N + * \endcode + * + * \see http://www.sgi.com/tech/stl/uninitialized_copy.html + * \see \c copy + * \see \c uninitialized_fill + * \see \c device_new + * \see \c device_malloc + */ +template + ForwardIterator uninitialized_copy(InputIterator first, + InputIterator last, + ForwardIterator result); + + +/*! In \c thrust, the function \c thrust::device_new allocates memory for + * an object and then creates an object at that location by calling a constructor. + * Occasionally, however, it is useful to separate those two operations. + * If each iterator in the range [result, result + n) points + * to uninitialized memory, then \p uninitialized_copy_n creates a copy of + * [first, first + n) in that range. That is, for each iterator \c i in + * the input, \p uninitialized_copy_n creates a copy of \c *i in the location pointed + * to by the corresponding iterator in the output range by \p InputIterator's + * \c value_type's copy constructor with *i as its argument. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The first element of the input range to copy from. + * \param n The number of elements to copy. + * \param result The first element of the output range to copy to. + * \return An iterator pointing to the last element of the output range. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator is a model of Input Iterator. + * \tparam Size is an integral type. + * \tparam ForwardIterator is a model of Forward Iterator, + * \p ForwardIterator is mutable, and \p ForwardIterator's \c value_type has a constructor that takes + * a single argument whose type is \p InputIterator's \c value_type. 
+ * + * \pre \p first may equal \p result, but the range [first, first + n) and the range [result, result + n) shall not overlap otherwise. + * + * The following code snippet demonstrates how to use \p uninitialized_copy to initialize + * a range of uninitialized memory using the \p thrust::device execution policy for + * parallelization: + * + * \code + * #include + * #include + * #include + * #include + * + * struct Int + * { + * __host__ __device__ + * Int(int x) : val(x) {} + * int val; + * }; + * ... + * const int N = 137; + * + * Int val(46); + * thrust::device_vector input(N, val); + * thrust::device_ptr array = thrust::device_malloc(N); + * thrust::uninitialized_copy_n(thrust::device, input.begin(), N, array); + * + * // Int x = array[i]; + * // x.val == 46 for all 0 <= i < N + * \endcode + * + * \see http://www.sgi.com/tech/stl/uninitialized_copy.html + * \see \c uninitialized_copy + * \see \c copy + * \see \c uninitialized_fill + * \see \c device_new + * \see \c device_malloc + */ +template + ForwardIterator uninitialized_copy_n(const thrust::detail::execution_policy_base &exec, + InputIterator first, + Size n, + ForwardIterator result); + + +/*! In \c thrust, the function \c thrust::device_new allocates memory for + * an object and then creates an object at that location by calling a constructor. + * Occasionally, however, it is useful to separate those two operations. + * If each iterator in the range [result, result + n) points + * to uninitialized memory, then \p uninitialized_copy_n creates a copy of + * [first, first + n) in that range. That is, for each iterator \c i in + * the input, \p uninitialized_copy_n creates a copy of \c *i in the location pointed + * to by the corresponding iterator in the output range by \p InputIterator's + * \c value_type's copy constructor with *i as its argument. + * + * \param first The first element of the input range to copy from. + * \param n The number of elements to copy. + * \param result The first element of the output range to copy to. + * \return An iterator pointing to the last element of the output range. + * + * \tparam InputIterator is a model of Input Iterator. + * \tparam Size is an integral type. + * \tparam ForwardIterator is a model of Forward Iterator, + * \p ForwardIterator is mutable, and \p ForwardIterator's \c value_type has a constructor that takes + * a single argument whose type is \p InputIterator's \c value_type. + * + * \pre \p first may equal \p result, but the range [first, first + n) and the range [result, result + n) shall not overlap otherwise. + * + * The following code snippet demonstrates how to use \p uninitialized_copy to initialize + * a range of uninitialized memory. + * + * \code + * #include + * #include + * #include + * + * struct Int + * { + * __host__ __device__ + * Int(int x) : val(x) {} + * int val; + * }; + * ... + * const int N = 137; + * + * Int val(46); + * thrust::device_vector input(N, val); + * thrust::device_ptr array = thrust::device_malloc(N); + * thrust::uninitialized_copy_n(input.begin(), N, array); + * + * // Int x = array[i]; + * // x.val == 46 for all 0 <= i < N + * \endcode + * + * \see http://www.sgi.com/tech/stl/uninitialized_copy.html + * \see \c uninitialized_copy + * \see \c copy + * \see \c uninitialized_fill + * \see \c device_new + * \see \c device_malloc + */ +template + ForwardIterator uninitialized_copy_n(InputIterator first, + Size n, + ForwardIterator result); + + +/*! 
\} // copying + */ + + +} // end thrust + +#include + diff --git a/compat/thrust/uninitialized_fill.h b/compat/thrust/uninitialized_fill.h new file mode 100644 index 0000000..c726241 --- /dev/null +++ b/compat/thrust/uninitialized_fill.h @@ -0,0 +1,273 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file uninitialized_fill.h + * \brief Copy construction into a range of uninitialized elements from a source value + */ + +#pragma once + +#include +#include + +namespace thrust +{ + + +/*! \addtogroup filling + * \ingroup transformations + * \{ + */ + + +/*! In \c thrust, the function \c thrust::device_new allocates memory for + * an object and then creates an object at that location by calling a + * constructor. Occasionally, however, it is useful to separate those two + * operations. If each iterator in the range [first, last) points + * to unitialized memory, then \p unitialized_fill creates copies of \c x + * in that range. That is, for each iterator \c i in the range [first, last), + * \p uninitialized_fill creates a copy of \c x in the location pointed to \c i by + * calling \p ForwardIterator's \c value_type's copy constructor. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The first element of the range of interest. + * \param last The last element of the range of interest. + * \param x The value to use as the exemplar of the copy constructor. + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam ForwardIterator is a model of Forward Iterator, + * \p ForwardIterator is mutable, and \p ForwardIterator's \c value_type has a constructor that + * takes a single argument of type \p T. + * + * The following code snippet demonstrates how to use \p uninitialized_fill to initialize a range of + * uninitialized memory using the \p thrust::device execution policy for parallelization: + * + * \code + * #include + * #include + * #include + * + * struct Int + * { + * __host__ __device__ + * Int(int x) : val(x) {} + * int val; + * }; + * ... + * const int N = 137; + * + * Int val(46); + * thrust::device_ptr array = thrust::device_malloc(N); + * thrust::uninitialized_fill(thrust::device, array, array + N, val); + * + * // Int x = array[i]; + * // x.val == 46 for all 0 <= i < N + * \endcode + * + * \see http://www.sgi.com/tech/stl/uninitialized_fill.html + * \see \c uninitialized_fill_n + * \see \c fill + * \see \c uninitialized_copy + * \see \c device_new + * \see \c device_malloc + */ +template + void uninitialized_fill(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + const T &x); + + +/*! In \c thrust, the function \c thrust::device_new allocates memory for + * an object and then creates an object at that location by calling a + * constructor. Occasionally, however, it is useful to separate those two + * operations. 
If each iterator in the range [first, last) points + * to unitialized memory, then \p unitialized_fill creates copies of \c x + * in that range. That is, for each iterator \c i in the range [first, last), + * \p uninitialized_fill creates a copy of \c x in the location pointed to \c i by + * calling \p ForwardIterator's \c value_type's copy constructor. + * + * \param first The first element of the range of interest. + * \param last The last element of the range of interest. + * \param x The value to use as the exemplar of the copy constructor. + * + * \tparam ForwardIterator is a model of Forward Iterator, + * \p ForwardIterator is mutable, and \p ForwardIterator's \c value_type has a constructor that + * takes a single argument of type \p T. + * + * The following code snippet demonstrates how to use \p uninitialized_fill to initialize a range of + * uninitialized memory. + * + * \code + * #include + * #include + * + * struct Int + * { + * __host__ __device__ + * Int(int x) : val(x) {} + * int val; + * }; + * ... + * const int N = 137; + * + * Int val(46); + * thrust::device_ptr array = thrust::device_malloc(N); + * thrust::uninitialized_fill(array, array + N, val); + * + * // Int x = array[i]; + * // x.val == 46 for all 0 <= i < N + * \endcode + * + * \see http://www.sgi.com/tech/stl/uninitialized_fill.html + * \see \c uninitialized_fill_n + * \see \c fill + * \see \c uninitialized_copy + * \see \c device_new + * \see \c device_malloc + */ +template + void uninitialized_fill(ForwardIterator first, + ForwardIterator last, + const T &x); + + +/*! In \c thrust, the function \c thrust::device_new allocates memory for + * an object and then creates an object at that location by calling a + * constructor. Occasionally, however, it is useful to separate those two + * operations. If each iterator in the range [first, first+n) points + * to unitialized memory, then \p unitialized_fill creates copies of \c x + * in that range. That is, for each iterator \c i in the range [first, first+n), + * \p uninitialized_fill creates a copy of \c x in the location pointed to \c i by + * calling \p ForwardIterator's \c value_type's copy constructor. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The first element of the range of interest. + * \param n The size of the range of interest. + * \param x The value to use as the exemplar of the copy constructor. + * \return first+n + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam ForwardIterator is a model of Forward Iterator, + * \p ForwardIterator is mutable, and \p ForwardIterator's \c value_type has a constructor that + * takes a single argument of type \p T. + * + * The following code snippet demonstrates how to use \p uninitialized_fill to initialize a range of + * uninitialized memory using the \p thrust::device execution policy for parallelization: + * + * \code + * #include + * #include + * #include + * + * struct Int + * { + * __host__ __device__ + * Int(int x) : val(x) {} + * int val; + * }; + * ... 
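+ * // a minimal sketch with the template arguments spelled out, assuming
+ * // the element type Int shown above:
+ * //   thrust::device_ptr<Int> p = thrust::device_malloc<Int>(N);
+ * //   thrust::uninitialized_fill_n(thrust::device, p, N, Int(46));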
+ * const int N = 137; + * + * Int val(46); + * thrust::device_ptr array = thrust::device_malloc(N); + * thrust::uninitialized_fill_n(thrust::device, array, N, val); + * + * // Int x = array[i]; + * // x.val == 46 for all 0 <= i < N + * \endcode + * + * \see http://www.sgi.com/tech/stl/uninitialized_fill.html + * \see \c uninitialized_fill + * \see \c fill + * \see \c uninitialized_copy_n + * \see \c device_new + * \see \c device_malloc + */ +template + ForwardIterator uninitialized_fill_n(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + Size n, + const T &x); + + +/*! In \c thrust, the function \c thrust::device_new allocates memory for + * an object and then creates an object at that location by calling a + * constructor. Occasionally, however, it is useful to separate those two + * operations. If each iterator in the range [first, first+n) points + * to unitialized memory, then \p unitialized_fill creates copies of \c x + * in that range. That is, for each iterator \c i in the range [first, first+n), + * \p uninitialized_fill creates a copy of \c x in the location pointed to \c i by + * calling \p ForwardIterator's \c value_type's copy constructor. + * + * \param first The first element of the range of interest. + * \param n The size of the range of interest. + * \param x The value to use as the exemplar of the copy constructor. + * \return first+n + * + * \tparam ForwardIterator is a model of Forward Iterator, + * \p ForwardIterator is mutable, and \p ForwardIterator's \c value_type has a constructor that + * takes a single argument of type \p T. + * + * The following code snippet demonstrates how to use \p uninitialized_fill to initialize a range of + * uninitialized memory. + * + * \code + * #include + * #include + * + * struct Int + * { + * __host__ __device__ + * Int(int x) : val(x) {} + * int val; + * }; + * ... + * const int N = 137; + * + * Int val(46); + * thrust::device_ptr array = thrust::device_malloc(N); + * thrust::uninitialized_fill_n(array, N, val); + * + * // Int x = array[i]; + * // x.val == 46 for all 0 <= i < N + * \endcode + * + * \see http://www.sgi.com/tech/stl/uninitialized_fill.html + * \see \c uninitialized_fill + * \see \c fill + * \see \c uninitialized_copy_n + * \see \c device_new + * \see \c device_malloc + */ +template + ForwardIterator uninitialized_fill_n(ForwardIterator first, + Size n, + const T &x); + +/*! \} // end filling + * \} // transformations + */ + +} // end thrust + +#include + diff --git a/compat/thrust/unique.h b/compat/thrust/unique.h new file mode 100644 index 0000000..98150f3 --- /dev/null +++ b/compat/thrust/unique.h @@ -0,0 +1,960 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/*! \file unique.h + * \brief Move unique elements to the front of a range + */ + +#pragma once + +#include +#include +#include + +namespace thrust +{ + + +/*! \addtogroup stream_compaction + * \{ + */ + + +/*! 
For each group of consecutive elements in the range [first, last) + * with the same value, \p unique removes all but the first element of + * the group. The return value is an iterator \c new_last such that + * no two consecutive elements in the range [first, new_last) are + * equal. The iterators in the range [new_last, last) are all still + * dereferenceable, but the elements that they point to are unspecified. + * \p unique is stable, meaning that the relative order of elements that are + * not removed is unchanged. + * + * This version of \p unique uses \c operator== to test for equality. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the input range. + * \param last The end of the input range. + * \return The end of the unique range [first, new_last). + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam ForwardIterator is a model of Forward Iterator, + * and \p ForwardIterator is mutable, + * and \p ForwardIterator's \c value_type is a model of Equality Comparable. + * + * The following code snippet demonstrates how to use \p unique to + * compact a sequence of numbers to remove consecutive duplicates using the \p thrust::host execution policy + * for parallelization: + * + * \code + * #include + * #include + * ... + * const int N = 7; + * int A[N] = {1, 3, 3, 3, 2, 2, 1}; + * int *new_end = thrust::unique(thrust::host, A, A + N); + * // The first four values of A are now {1, 3, 2, 1} + * // Values beyond new_end are unspecified. + * \endcode + * + * \see http://www.sgi.com/tech/stl/unique.html + * \see unique_copy + */ +template +ForwardIterator unique(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last); + + +/*! For each group of consecutive elements in the range [first, last) + * with the same value, \p unique removes all but the first element of + * the group. The return value is an iterator \c new_last such that + * no two consecutive elements in the range [first, new_last) are + * equal. The iterators in the range [new_last, last) are all still + * dereferenceable, but the elements that they point to are unspecified. + * \p unique is stable, meaning that the relative order of elements that are + * not removed is unchanged. + * + * This version of \p unique uses \c operator== to test for equality. + * + * \param first The beginning of the input range. + * \param last The end of the input range. + * \return The end of the unique range [first, new_last). + * + * \tparam ForwardIterator is a model of Forward Iterator, + * and \p ForwardIterator is mutable, + * and \p ForwardIterator's \c value_type is a model of Equality Comparable. + * + * The following code snippet demonstrates how to use \p unique to + * compact a sequence of numbers to remove consecutive duplicates. + * + * \code + * #include + * ... + * const int N = 7; + * int A[N] = {1, 3, 3, 3, 2, 2, 1}; + * int *new_end = thrust::unique(A, A + N); + * // The first four values of A are now {1, 3, 2, 1} + * // Values beyond new_end are unspecified. + * \endcode + * + * \see http://www.sgi.com/tech/stl/unique.html + * \see unique_copy + */ +template +ForwardIterator unique(ForwardIterator first, + ForwardIterator last); + + +/*! For each group of consecutive elements in the range [first, last) + * with the same value, \p unique removes all but the first element of + * the group. 
The return value is an iterator \c new_last such that + * no two consecutive elements in the range [first, new_last) are + * equal. The iterators in the range [new_last, last) are all still + * dereferenceable, but the elements that they point to are unspecified. + * \p unique is stable, meaning that the relative order of elements that are + * not removed is unchanged. + * + * This version of \p unique uses the function object \p binary_pred to test + * for equality. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the input range. + * \param last The end of the input range. + * \param binary_pred The binary predicate used to determine equality. + * \return The end of the unique range [first, new_last) + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam ForwardIterator is a model of Forward Iterator, + * and \p ForwardIterator is mutable, + * and \p ForwardIterator's \c value_type is convertible to \p BinaryPredicate's \c first_argument_type and to \p BinaryPredicate's \c second_argument_type. + * \tparam BinaryPredicate is a model of Binary Predicate. + * + * The following code snippet demonstrates how to use \p unique to + * compact a sequence of numbers to remove consecutive duplicates using the \p thrust::host execution policy + * for parallelization: + * + * \code + * #include + * #include + * ... + * const int N = 7; + * int A[N] = {1, 3, 3, 3, 2, 2, 1}; + * int *new_end = thrust::unique(thrust::host, A, A + N, thrust::equal_to()); + * // The first four values of A are now {1, 3, 2, 1} + * // Values beyond new_end are unspecified. + * \endcode + * + * \see http://www.sgi.com/tech/stl/unique.html + * \see unique_copy + */ +template +ForwardIterator unique(const thrust::detail::execution_policy_base &exec, + ForwardIterator first, + ForwardIterator last, + BinaryPredicate binary_pred); + + +/*! For each group of consecutive elements in the range [first, last) + * with the same value, \p unique removes all but the first element of + * the group. The return value is an iterator \c new_last such that + * no two consecutive elements in the range [first, new_last) are + * equal. The iterators in the range [new_last, last) are all still + * dereferenceable, but the elements that they point to are unspecified. + * \p unique is stable, meaning that the relative order of elements that are + * not removed is unchanged. + * + * This version of \p unique uses the function object \p binary_pred to test + * for equality. + * + * \param first The beginning of the input range. + * \param last The end of the input range. + * \param binary_pred The binary predicate used to determine equality. + * \return The end of the unique range [first, new_last) + * + * \tparam ForwardIterator is a model of Forward Iterator, + * and \p ForwardIterator is mutable, + * and \p ForwardIterator's \c value_type is convertible to \p BinaryPredicate's \c first_argument_type and to \p BinaryPredicate's \c second_argument_type. + * \tparam BinaryPredicate is a model of Binary Predicate. + * + * The following code snippet demonstrates how to use \p unique to + * compact a sequence of numbers to remove consecutive duplicates. + * + * \code + * #include + * ... 
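+ * // a minimal sketch of the same compaction on a container, assuming a
+ * // populated thrust::host_vector<int> v:
+ * //   v.erase(thrust::unique(v.begin(), v.end()), v.end());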
+ * const int N = 7; + * int A[N] = {1, 3, 3, 3, 2, 2, 1}; + * int *new_end = thrust::unique(A, A + N, thrust::equal_to()); + * // The first four values of A are now {1, 3, 2, 1} + * // Values beyond new_end are unspecified. + * \endcode + * + * \see http://www.sgi.com/tech/stl/unique.html + * \see unique_copy + */ +template +ForwardIterator unique(ForwardIterator first, + ForwardIterator last, + BinaryPredicate binary_pred); + + +/*! \p unique_copy copies elements from the range [first, last) + * to a range beginning with \p result, except that in a consecutive group + * of duplicate elements only the first one is copied. The return value + * is the end of the range to which the elements are copied. + * + * The reason there are two different versions of unique_copy is that there + * are two different definitions of what it means for a consecutive group of + * elements to be duplicates. In the first version, the test is simple + * equality: the elements in a range [f, l) are duplicates if, + * for every iterator \p i in the range, either i == f or else + * *i == *(i-1). In the second, the test is an arbitrary + * \p BinaryPredicate \p binary_pred: the elements in [f, l) are + * duplicates if, for every iterator \p i in the range, either i == f + * or else binary_pred(*i, *(i-1)) is \p true. + * + * This version of \p unique_copy uses \c operator== to test for equality. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the input range. + * \param last The end of the input range. + * \param result The beginning of the output range. + * \return The end of the unique range [result, result_end). + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator is a model of Input Iterator, + * and \p InputIterator's \c value_type is a model of Equality Comparable. + * \tparam OutputIterator is a model of Output Iterator and + * and \p InputIterator's \c value_type is convertible to \c OutputIterator's \c value_type. + * + * \pre The range [first,last) and the range [result, result + (last - first)) shall not overlap. + * + * The following code snippet demonstrates how to use \p unique_copy to + * compact a sequence of numbers to remove consecutive duplicates using the \p thrust::host execution + * policy for parallelization: + * + * \code + * #include + * #include + * ... + * const int N = 7; + * int A[N] = {1, 3, 3, 3, 2, 2, 1}; + * int B[N]; + * int *result_end = thrust::unique_copy(thrust::host, A, A + N, B); + * // The first four values of B are now {1, 3, 2, 1} and (result_end - B) is 4 + * // Values beyond result_end are unspecified + * \endcode + * + * \see unique + * \see http://www.sgi.com/tech/stl/unique_copy.html + */ +template +OutputIterator unique_copy(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + OutputIterator result); + + +/*! \p unique_copy copies elements from the range [first, last) + * to a range beginning with \p result, except that in a consecutive group + * of duplicate elements only the first one is copied. The return value + * is the end of the range to which the elements are copied. + * + * The reason there are two different versions of unique_copy is that there + * are two different definitions of what it means for a consecutive group of + * elements to be duplicates. 
In the first version, the test is simple + * equality: the elements in a range [f, l) are duplicates if, + * for every iterator \p i in the range, either i == f or else + * *i == *(i-1). In the second, the test is an arbitrary + * \p BinaryPredicate \p binary_pred: the elements in [f, l) are + * duplicates if, for every iterator \p i in the range, either i == f + * or else binary_pred(*i, *(i-1)) is \p true. + * + * This version of \p unique_copy uses \c operator== to test for equality. + * + * \param first The beginning of the input range. + * \param last The end of the input range. + * \param result The beginning of the output range. + * \return The end of the unique range [result, result_end). + * + * \tparam InputIterator is a model of Input Iterator, + * and \p InputIterator's \c value_type is a model of Equality Comparable. + * \tparam OutputIterator is a model of Output Iterator and + * and \p InputIterator's \c value_type is convertible to \c OutputIterator's \c value_type. + * + * \pre The range [first,last) and the range [result, result + (last - first)) shall not overlap. + * + * The following code snippet demonstrates how to use \p unique_copy to + * compact a sequence of numbers to remove consecutive duplicates. + * + * \code + * #include + * ... + * const int N = 7; + * int A[N] = {1, 3, 3, 3, 2, 2, 1}; + * int B[N]; + * int *result_end = thrust::unique_copy(A, A + N, B); + * // The first four values of B are now {1, 3, 2, 1} and (result_end - B) is 4 + * // Values beyond result_end are unspecified + * \endcode + * + * \see unique + * \see http://www.sgi.com/tech/stl/unique_copy.html + */ +template +OutputIterator unique_copy(InputIterator first, + InputIterator last, + OutputIterator result); + + +/*! \p unique_copy copies elements from the range [first, last) + * to a range beginning with \p result, except that in a consecutive group + * of duplicate elements only the first one is copied. The return value + * is the end of the range to which the elements are copied. + * + * This version of \p unique_copy uses the function object \c binary_pred + * to test for equality. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param first The beginning of the input range. + * \param last The end of the input range. + * \param result The beginning of the output range. + * \param binary_pred The binary predicate used to determine equality. + * \return The end of the unique range [result, result_end). + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator is a model of Input Iterator, + * and \p InputIterator's \c value_type is a model of Equality Comparable. + * \tparam OutputIterator is a model of Output Iterator and + * and \p InputIterator's \c value_type is convertible to \c OutputIterator's \c value_type. + * \tparam BinaryPredicate is a model of Binary Predicate. + * + * \pre The range [first,last) and the range [result, result + (last - first)) shall not overlap. + * + * The following code snippet demonstrates how to use \p unique_copy to + * compact a sequence of numbers to remove consecutive duplicates using the \p thrust::host execution + * policy for parallelization: + * + * \code + * #include + * #include + * ... 
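+ * // the predicate passed below is assumed to be the int instantiation,
+ * // i.e. thrust::equal_to<int>()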
+ * const int N = 7; + * int A[N] = {1, 3, 3, 3, 2, 2, 1}; + * int B[N]; + * int *result_end = thrust::unique_copy(thrust::host, A, A + N, B, thrust::equal_to()); + * // The first four values of B are now {1, 3, 2, 1} and (result_end - B) is 4 + * // Values beyond result_end are unspecified. + * \endcode + * + * \see unique + * \see http://www.sgi.com/tech/stl/unique_copy.html + */ +template +OutputIterator unique_copy(const thrust::detail::execution_policy_base &exec, + InputIterator first, + InputIterator last, + OutputIterator result, + BinaryPredicate binary_pred); + + +/*! \p unique_copy copies elements from the range [first, last) + * to a range beginning with \p result, except that in a consecutive group + * of duplicate elements only the first one is copied. The return value + * is the end of the range to which the elements are copied. + * + * This version of \p unique_copy uses the function object \c binary_pred + * to test for equality. + * + * \param first The beginning of the input range. + * \param last The end of the input range. + * \param result The beginning of the output range. + * \param binary_pred The binary predicate used to determine equality. + * \return The end of the unique range [result, result_end). + * + * \tparam InputIterator is a model of Input Iterator, + * and \p InputIterator's \c value_type is a model of Equality Comparable. + * \tparam OutputIterator is a model of Output Iterator and + * and \p InputIterator's \c value_type is convertible to \c OutputIterator's \c value_type. + * \tparam BinaryPredicate is a model of Binary Predicate. + * + * \pre The range [first,last) and the range [result, result + (last - first)) shall not overlap. + * + * The following code snippet demonstrates how to use \p unique_copy to + * compact a sequence of numbers to remove consecutive duplicates. + * + * \code + * #include + * ... + * const int N = 7; + * int A[N] = {1, 3, 3, 3, 2, 2, 1}; + * int B[N]; + * int *result_end = thrust::unique_copy(A, A + N, B, thrust::equal_to()); + * // The first four values of B are now {1, 3, 2, 1} and (result_end - B) is 4 + * // Values beyond result_end are unspecified. + * \endcode + * + * \see unique + * \see http://www.sgi.com/tech/stl/unique_copy.html + */ +template +OutputIterator unique_copy(InputIterator first, + InputIterator last, + OutputIterator result, + BinaryPredicate binary_pred); + + +/*! \p unique_by_key is a generalization of \p unique to key-value pairs. + * For each group of consecutive keys in the range [keys_first, keys_last) + * that are equal, \p unique_by_key removes all but the first element of + * the group. Similarly, the corresponding values in the range + * [values_first, values_first + (keys_last - keys_first)) + * are also removed. + * + * The return value is a \p pair of iterators (new_keys_last,new_values_last) + * such that no two consecutive elements in the range [keys_first, new_keys_last) + * are equal. + * + * This version of \p unique_by_key uses \c operator== to test for equality and + * \c project1st to reduce values with equal keys. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param keys_first The beginning of the key range. + * \param keys_last The end of the key range. + * \param values_first The beginning of the value range. + * \return A pair of iterators at end of the ranges [key_first, keys_new_last) and [values_first, values_new_last). 
+ * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam ForwardIterator1 is a model of Forward Iterator, + * and \p ForwardIterator1 is mutable, + * and \p ForwardIterator's \c value_type is a model of Equality Comparable. + * \tparam ForwardIterator2 is a model of Forward Iterator, + * and \p ForwardIterator2 is mutable. + * + * \pre The range [keys_first, keys_last) and the range [values_first, values_first + (keys_last - keys_first)) shall not overlap. + * + * The following code snippet demonstrates how to use \p unique_by_key to + * compact a sequence of key/value pairs to remove consecutive duplicates using the \p thrust::host + * execution policy for parallelization: + * + * \code + * #include + * #include + * ... + * const int N = 7; + * int A[N] = {1, 3, 3, 3, 2, 2, 1}; // keys + * int B[N] = {9, 8, 7, 6, 5, 4, 3}; // values + * + * thrust::pair new_end; + * new_end = thrust::unique_by_key(thrust::host, A, A + N, B); + * + * // The first four keys in A are now {1, 3, 2, 1} and new_end.first - A is 4. + * // The first four values in B are now {9, 8, 5, 3} and new_end.second - B is 4. + * \endcode + * + * \see unique + * \see unique_by_key_copy + * \see reduce_by_key + */ +template + thrust::pair + unique_by_key(const thrust::detail::execution_policy_base &exec, + ForwardIterator1 keys_first, + ForwardIterator1 keys_last, + ForwardIterator2 values_first); + + +/*! \p unique_by_key is a generalization of \p unique to key-value pairs. + * For each group of consecutive keys in the range [keys_first, keys_last) + * that are equal, \p unique_by_key removes all but the first element of + * the group. Similarly, the corresponding values in the range + * [values_first, values_first + (keys_last - keys_first)) + * are also removed. + * + * The return value is a \p pair of iterators (new_keys_last,new_values_last) + * such that no two consecutive elements in the range [keys_first, new_keys_last) + * are equal. + * + * This version of \p unique_by_key uses \c operator== to test for equality and + * \c project1st to reduce values with equal keys. + * + * \param keys_first The beginning of the key range. + * \param keys_last The end of the key range. + * \param values_first The beginning of the value range. + * \return A pair of iterators at end of the ranges [key_first, keys_new_last) and [values_first, values_new_last). + * + * \tparam ForwardIterator1 is a model of Forward Iterator, + * and \p ForwardIterator1 is mutable, + * and \p ForwardIterator's \c value_type is a model of Equality Comparable. + * \tparam ForwardIterator2 is a model of Forward Iterator, + * and \p ForwardIterator2 is mutable. + * + * \pre The range [keys_first, keys_last) and the range [values_first, values_first + (keys_last - keys_first)) shall not overlap. + * + * The following code snippet demonstrates how to use \p unique_by_key to + * compact a sequence of key/value pairs to remove consecutive duplicates. + * + * \code + * #include + * ... + * const int N = 7; + * int A[N] = {1, 3, 3, 3, 2, 2, 1}; // keys + * int B[N] = {9, 8, 7, 6, 5, 4, 3}; // values + * + * thrust::pair new_end; + * new_end = thrust::unique_by_key(A, A + N, B); + * + * // The first four keys in A are now {1, 3, 2, 1} and new_end.first - A is 4. + * // The first four values in B are now {9, 8, 5, 3} and new_end.second - B is 4. 
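+ *
+ * // a small follow-up sketch: the number of surviving key/value pairs is
+ * //   int num_unique = new_end.first - A;   // 4 in this example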
+ * \endcode + * + * \see unique + * \see unique_by_key_copy + * \see reduce_by_key + */ +template + thrust::pair + unique_by_key(ForwardIterator1 keys_first, + ForwardIterator1 keys_last, + ForwardIterator2 values_first); + + +/*! \p unique_by_key is a generalization of \p unique to key-value pairs. + * For each group of consecutive keys in the range [keys_first, keys_last) + * that are equal, \p unique_by_key removes all but the first element of + * the group. Similarly, the corresponding values in the range + * [values_first, values_first + (keys_last - keys_first)) + * are also removed. + * + * This version of \p unique_by_key uses the function object \c binary_pred + * to test for equality and \c project1st to reduce values with equal keys. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param keys_first The beginning of the key range. + * \param keys_last The end of the key range. + * \param values_first The beginning of the value range. + * \param binary_pred The binary predicate used to determine equality. + * \return The end of the unique range [first, new_last). + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam ForwardIterator1 is a model of Forward Iterator, + * and \p ForwardIterator1 is mutable, + * and \p ForwardIterator's \c value_type is a model of Equality Comparable. + * \tparam ForwardIterator2 is a model of Forward Iterator, + * and \p ForwardIterator2 is mutable. + * \tparam BinaryPredicate is a model of Binary Predicate. + * + * \pre The range [keys_first, keys_last) and the range [values_first, values_first + (keys_last - keys_first)) shall not overlap. + * + * The following code snippet demonstrates how to use \p unique_by_key to + * compact a sequence of key/value pairs to remove consecutive duplicates using the \p thrust::host + * execution policy for parallelization: + * + * \code + * #include + * #include + * ... + * const int N = 7; + * int A[N] = {1, 3, 3, 3, 2, 2, 1}; // keys + * int B[N] = {9, 8, 7, 6, 5, 4, 3}; // values + * + * thrust::pair new_end; + * thrust::equal_to binary_pred; + * new_end = thrust::unique_by_key(thrust::host, keys, keys + N, values, binary_pred); + * + * // The first four keys in A are now {1, 3, 2, 1} and new_end.first - A is 4. + * // The first four values in B are now {9, 8, 5, 3} and new_end.second - B is 4. + * \endcode + * + * \see unique + * \see unique_by_key_copy + * \see reduce_by_key + */ +template + thrust::pair + unique_by_key(const thrust::detail::execution_policy_base &exec, + ForwardIterator1 keys_first, + ForwardIterator1 keys_last, + ForwardIterator2 values_first, + BinaryPredicate binary_pred); + + +/*! \p unique_by_key is a generalization of \p unique to key-value pairs. + * For each group of consecutive keys in the range [keys_first, keys_last) + * that are equal, \p unique_by_key removes all but the first element of + * the group. Similarly, the corresponding values in the range + * [values_first, values_first + (keys_last - keys_first)) + * are also removed. + * + * This version of \p unique_by_key uses the function object \c binary_pred + * to test for equality and \c project1st to reduce values with equal keys. + * + * \param keys_first The beginning of the key range. + * \param keys_last The end of the key range. + * \param values_first The beginning of the value range. + * \param binary_pred The binary predicate used to determine equality. 
+ * \return The end of the unique range [first, new_last). + * + * \tparam ForwardIterator1 is a model of Forward Iterator, + * and \p ForwardIterator1 is mutable, + * and \p ForwardIterator's \c value_type is a model of Equality Comparable. + * \tparam ForwardIterator2 is a model of Forward Iterator, + * and \p ForwardIterator2 is mutable. + * \tparam BinaryPredicate is a model of Binary Predicate. + * + * \pre The range [keys_first, keys_last) and the range [values_first, values_first + (keys_last - keys_first)) shall not overlap. + * + * The following code snippet demonstrates how to use \p unique_by_key to + * compact a sequence of key/value pairs to remove consecutive duplicates. + * + * \code + * #include + * ... + * const int N = 7; + * int A[N] = {1, 3, 3, 3, 2, 2, 1}; // keys + * int B[N] = {9, 8, 7, 6, 5, 4, 3}; // values + * + * thrust::pair new_end; + * thrust::equal_to binary_pred; + * new_end = thrust::unique_by_key(keys, keys + N, values, binary_pred); + * + * // The first four keys in A are now {1, 3, 2, 1} and new_end.first - A is 4. + * // The first four values in B are now {9, 8, 5, 3} and new_end.second - B is 4. + * \endcode + * + * \see unique + * \see unique_by_key_copy + * \see reduce_by_key + */ +template + thrust::pair + unique_by_key(ForwardIterator1 keys_first, + ForwardIterator1 keys_last, + ForwardIterator2 values_first, + BinaryPredicate binary_pred); + + +/*! \p unique_by_key_copy is a generalization of \p unique_copy to key-value pairs. + * For each group of consecutive keys in the range [keys_first, keys_last) + * that are equal, \p unique_by_key_copy copies the first element of the group to + * a range beginning with \c keys_result and the corresponding values from the range + * [values_first, values_first + (keys_last - keys_first)) are copied to a range + * beginning with \c values_result. + * + * This version of \p unique_by_key_copy uses \c operator== to test for equality and + * \c project1st to reduce values with equal keys. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param keys_first The beginning of the input key range. + * \param keys_last The end of the input key range. + * \param values_first The beginning of the input value range. + * \param keys_result The beginning of the output key range. + * \param values_result The beginning of the output value range. + * \return A pair of iterators at end of the ranges [keys_result, keys_result_last) and [values_result, values_result_last). + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator1 is a model of Input Iterator, + * \tparam InputIterator2 is a model of Input Iterator, + * \tparam OutputIterator1 is a model of Output Iterator and + * and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type. + * \tparam OutputIterator2 is a model of Output Iterator and + * and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type. + * + * \pre The input ranges shall not overlap either output range. + * + * The following code snippet demonstrates how to use \p unique_by_key_copy to + * compact a sequence of key/value pairs and with equal keys using the \p thrust::host execution policy + * for parallelization: + * + * \code + * #include + * #include + * ... 
+ * const int N = 7; + * int A[N] = {1, 3, 3, 3, 2, 2, 1}; // input keys + * int B[N] = {9, 8, 7, 6, 5, 4, 3}; // input values + * int C[N]; // output keys + * int D[N]; // output values + * + * thrust::pair new_end; + * new_end = thrust::unique_by_key_copy(thrust::host, A, A + N, B, C, D); + * + * // The first four keys in C are now {1, 3, 2, 1} and new_end.first - C is 4. + * // The first four values in D are now {9, 8, 5, 3} and new_end.second - D is 4. + * \endcode + * + * \see unique_copy + * \see unique_by_key + * \see reduce_by_key + */ +template + thrust::pair + unique_by_key_copy(const thrust::detail::execution_policy_base &exec, + InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + OutputIterator1 keys_result, + OutputIterator2 values_result); + + +/*! \p unique_by_key_copy is a generalization of \p unique_copy to key-value pairs. + * For each group of consecutive keys in the range [keys_first, keys_last) + * that are equal, \p unique_by_key_copy copies the first element of the group to + * a range beginning with \c keys_result and the corresponding values from the range + * [values_first, values_first + (keys_last - keys_first)) are copied to a range + * beginning with \c values_result. + * + * This version of \p unique_by_key_copy uses \c operator== to test for equality and + * \c project1st to reduce values with equal keys. + * + * \param keys_first The beginning of the input key range. + * \param keys_last The end of the input key range. + * \param values_first The beginning of the input value range. + * \param keys_result The beginning of the output key range. + * \param values_result The beginning of the output value range. + * \return A pair of iterators at end of the ranges [keys_result, keys_result_last) and [values_result, values_result_last). + * + * \tparam InputIterator1 is a model of Input Iterator, + * \tparam InputIterator2 is a model of Input Iterator, + * \tparam OutputIterator1 is a model of Output Iterator and + * and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type. + * \tparam OutputIterator2 is a model of Output Iterator and + * and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type. + * + * \pre The input ranges shall not overlap either output range. + * + * The following code snippet demonstrates how to use \p unique_by_key_copy to + * compact a sequence of key/value pairs and with equal keys. + * + * \code + * #include + * ... + * const int N = 7; + * int A[N] = {1, 3, 3, 3, 2, 2, 1}; // input keys + * int B[N] = {9, 8, 7, 6, 5, 4, 3}; // input values + * int C[N]; // output keys + * int D[N]; // output values + * + * thrust::pair new_end; + * new_end = thrust::unique_by_key_copy(A, A + N, B, C, D); + * + * // The first four keys in C are now {1, 3, 2, 1} and new_end.first - C is 4. + * // The first four values in D are now {9, 8, 5, 3} and new_end.second - D is 4. + * \endcode + * + * \see unique_copy + * \see unique_by_key + * \see reduce_by_key + */ +template + thrust::pair + unique_by_key_copy(InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + OutputIterator1 keys_result, + OutputIterator2 values_result); + + +/*! \p unique_by_key_copy is a generalization of \p unique_copy to key-value pairs. 
+ * For each group of consecutive keys in the range [keys_first, keys_last) + * that are equal, \p unique_by_key_copy copies the first element of the group to + * a range beginning with \c keys_result and the corresponding values from the range + * [values_first, values_first + (keys_last - keys_first)) are copied to a range + * beginning with \c values_result. + * + * This version of \p unique_by_key_copy uses the function object \c binary_pred + * to test for equality and \c project1st to reduce values with equal keys. + * + * The algorithm's execution is parallelized as determined by \p exec. + * + * \param exec The execution policy to use for parallelization. + * \param keys_first The beginning of the input key range. + * \param keys_last The end of the input key range. + * \param values_first The beginning of the input value range. + * \param keys_result The beginning of the output key range. + * \param values_result The beginning of the output value range. + * \param binary_pred The binary predicate used to determine equality. + * \return A pair of iterators at end of the ranges [keys_result, keys_result_last) and [values_result, values_result_last). + * + * \tparam DerivedPolicy The name of the derived execution policy. + * \tparam InputIterator1 is a model of Input Iterator, + * \tparam InputIterator2 is a model of Input Iterator, + * \tparam OutputIterator1 is a model of Output Iterator and + * and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type. + * \tparam OutputIterator2 is a model of Output Iterator and + * and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type. + * \tparam BinaryPredicate is a model of Binary Predicate. + * + * \pre The input ranges shall not overlap either output range. + * + * The following code snippet demonstrates how to use \p unique_by_key_copy to + * compact a sequence of key/value pairs and with equal keys using the \p thrust::host execution policy for + * parallelization: + * + * \code + * #include + * #include + * ... + * const int N = 7; + * int A[N] = {1, 3, 3, 3, 2, 2, 1}; // input keys + * int B[N] = {9, 8, 7, 6, 5, 4, 3}; // input values + * int C[N]; // output keys + * int D[N]; // output values + * + * thrust::pair new_end; + * thrust::equal_to binary_pred; + * new_end = thrust::unique_by_key_copy(thrust::host, A, A + N, B, C, D, binary_pred); + * + * // The first four keys in C are now {1, 3, 2, 1} and new_end.first - C is 4. + * // The first four values in D are now {9, 8, 5, 3} and new_end.second - D is 4. + * \endcode + * + * \see unique_copy + * \see unique_by_key + * \see reduce_by_key + */ +template + thrust::pair + unique_by_key_copy(const thrust::detail::execution_policy_base &exec, + InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + OutputIterator1 keys_result, + OutputIterator2 values_result, + BinaryPredicate binary_pred); + + +/*! \p unique_by_key_copy is a generalization of \p unique_copy to key-value pairs. + * For each group of consecutive keys in the range [keys_first, keys_last) + * that are equal, \p unique_by_key_copy copies the first element of the group to + * a range beginning with \c keys_result and the corresponding values from the range + * [values_first, values_first + (keys_last - keys_first)) are copied to a range + * beginning with \c values_result. 
+ * + * This version of \p unique_by_key_copy uses the function object \c binary_pred + * to test for equality and \c project1st to reduce values with equal keys. + * + * \param keys_first The beginning of the input key range. + * \param keys_last The end of the input key range. + * \param values_first The beginning of the input value range. + * \param keys_result The beginning of the output key range. + * \param values_result The beginning of the output value range. + * \param binary_pred The binary predicate used to determine equality. + * \return A pair of iterators at end of the ranges [keys_result, keys_result_last) and [values_result, values_result_last). + * + * \tparam InputIterator1 is a model of Input Iterator, + * \tparam InputIterator2 is a model of Input Iterator, + * \tparam OutputIterator1 is a model of Output Iterator and + * and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type. + * \tparam OutputIterator2 is a model of Output Iterator and + * and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type. + * \tparam BinaryPredicate is a model of Binary Predicate. + * + * \pre The input ranges shall not overlap either output range. + * + * The following code snippet demonstrates how to use \p unique_by_key_copy to + * compact a sequence of key/value pairs and with equal keys. + * + * \code + * #include + * ... + * const int N = 7; + * int A[N] = {1, 3, 3, 3, 2, 2, 1}; // input keys + * int B[N] = {9, 8, 7, 6, 5, 4, 3}; // input values + * int C[N]; // output keys + * int D[N]; // output values + * + * thrust::pair new_end; + * thrust::equal_to binary_pred; + * new_end = thrust::unique_by_key_copy(A, A + N, B, C, D, binary_pred); + * + * // The first four keys in C are now {1, 3, 2, 1} and new_end.first - C is 4. + * // The first four values in D are now {9, 8, 5, 3} and new_end.second - D is 4. + * \endcode + * + * \see unique_copy + * \see unique_by_key + * \see reduce_by_key + */ +template + thrust::pair + unique_by_key_copy(InputIterator1 keys_first, + InputIterator1 keys_last, + InputIterator2 values_first, + OutputIterator1 keys_result, + OutputIterator2 values_result, + BinaryPredicate binary_pred); + + +/*! \} // end stream_compaction + */ + + +} // end namespace thrust + +#include + diff --git a/compat/thrust/version.h b/compat/thrust/version.h new file mode 100644 index 0000000..730997e --- /dev/null +++ b/compat/thrust/version.h @@ -0,0 +1,73 @@ +/* + * Copyright 2008-2012 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file version.h + * \brief Compile-time macros encoding Thrust release version + * + * is the only Thrust header that is guaranteed to + * change with every thrust release. + */ + +#pragma once + +#include + +// This is the only thrust header that is guaranteed to +// change with every thrust release. 
+// +// THRUST_VERSION % 100 is the sub-minor version +// THRUST_VERSION / 100 % 1000 is the minor version +// THRUST_VERSION / 100000 is the major version + +/*! \def THRUST_VERSION + * \brief The preprocessor macro \p THRUST_VERSION encodes the version + * number of the Thrust library. + * + * THRUST_VERSION % 100 is the sub-minor version. + * THRUST_VERSION / 100 % 1000 is the minor version. + * THRUST_VERSION / 100000 is the major version. + */ +#define THRUST_VERSION 100700 + +/*! \def THRUST_MAJOR_VERSION + * \brief The preprocessor macro \p THRUST_MAJOR_VERSION encodes the + * major version number of the Thrust library. + */ +#define THRUST_MAJOR_VERSION (THRUST_VERSION / 100000) + +/*! \def THRUST_MINOR_VERSION + * \brief The preprocessor macro \p THRUST_MINOR_VERSION encodes the + * minor version number of the Thrust library. + */ +#define THRUST_MINOR_VERSION (THRUST_VERSION / 100 % 1000) + +/*! \def THRUST_SUBMINOR_VERSION + * \brief The preprocessor macro \p THRUST_SUBMINOR_VERSION encodes the + * sub-minor version number of the Thrust library. + */ +#define THRUST_SUBMINOR_VERSION (THRUST_VERSION % 100) + +// Declare these namespaces here for the purpose of Doxygenating them + +/*! \namespace thrust + * \brief \p thrust is the top-level namespace which contains all Thrust + * functions and types. + */ +namespace thrust +{ + +} + diff --git a/compat/unistd.h b/compat/unistd.h new file mode 100644 index 0000000..193da66 --- /dev/null +++ b/compat/unistd.h @@ -0,0 +1,2 @@ +#pragma once +#include "getopt/getopt.h" \ No newline at end of file diff --git a/compile b/compile new file mode 100644 index 0000000..a49b6d0 --- /dev/null +++ b/compile @@ -0,0 +1,310 @@ +#! /bin/sh +# Wrapper for compilers which do not understand '-c -o'. + +scriptversion=2012-01-04.17; # UTC + +# Copyright (C) 1999, 2000, 2003, 2004, 2005, 2009, 2010, 2012 Free +# Software Foundation, Inc. +# Written by Tom Tromey . +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +# As a special exception to the GNU General Public License, if you +# distribute this file as part of a program that contains a +# configuration script generated by Autoconf, you may include it under +# the same distribution terms that you use for the rest of that program. + +# This file is maintained in Automake, please report +# bugs to or send patches to +# . + +nl=' +' + +# We need space, tab and new line, in precisely that order. Quoting is +# there to prevent tools from complaining about whitespace usage. +IFS=" "" $nl" + +file_conv= + +# func_file_conv build_file lazy +# Convert a $build file to $host form and store it in $file +# Currently only supports Windows hosts. If the determined conversion +# type is listed in (the comma separated) LAZY, no conversion will +# take place. 
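Before func_file_conv is defined below, a brief hedged illustration of the conversion it performs; the path and host are hypothetical, and only the Cygwin branch is shown (MinGW hosts go through cmd, everything else through winepath):

  # Hypothetical calls, assuming a Cygwin host and that func_file_conv is in scope:
  file_conv=                          # empty, so the host type is detected lazily on first use
  func_file_conv /home/user/foo.obj
  echo "$file"                        # e.g. C:/cygwin64/home/user/foo.obj (via `cygpath -m`)
  func_file_conv /home/user/foo.obj cygwin
  echo "$file"                        # unchanged: "cygwin" is in the lazy list, so no conversion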
+func_file_conv () +{ + file=$1 + case $file in + / | /[!/]*) # absolute file, and not a UNC file + if test -z "$file_conv"; then + # lazily determine how to convert abs files + case `uname -s` in + MINGW*) + file_conv=mingw + ;; + CYGWIN*) + file_conv=cygwin + ;; + *) + file_conv=wine + ;; + esac + fi + case $file_conv/,$2, in + *,$file_conv,*) + ;; + mingw/*) + file=`cmd //C echo "$file " | sed -e 's/"\(.*\) " *$/\1/'` + ;; + cygwin/*) + file=`cygpath -m "$file" || echo "$file"` + ;; + wine/*) + file=`winepath -w "$file" || echo "$file"` + ;; + esac + ;; + esac +} + +# func_cl_wrapper cl arg... +# Adjust compile command to suit cl +func_cl_wrapper () +{ + # Assume a capable shell + lib_path= + shared=: + linker_opts= + for arg + do + if test -n "$eat"; then + eat= + else + case $1 in + -o) + # configure might choose to run compile as 'compile cc -o foo foo.c'. + eat=1 + case $2 in + *.o | *.[oO][bB][jJ]) + func_file_conv "$2" + set x "$@" -Fo"$file" + shift + ;; + *) + func_file_conv "$2" + set x "$@" -Fe"$file" + shift + ;; + esac + ;; + -I*) + func_file_conv "${1#-I}" mingw + set x "$@" -I"$file" + shift + ;; + -l*) + lib=${1#-l} + found=no + save_IFS=$IFS + IFS=';' + for dir in $lib_path $LIB + do + IFS=$save_IFS + if $shared && test -f "$dir/$lib.dll.lib"; then + found=yes + set x "$@" "$dir/$lib.dll.lib" + break + fi + if test -f "$dir/$lib.lib"; then + found=yes + set x "$@" "$dir/$lib.lib" + break + fi + done + IFS=$save_IFS + + test "$found" != yes && set x "$@" "$lib.lib" + shift + ;; + -L*) + func_file_conv "${1#-L}" + if test -z "$lib_path"; then + lib_path=$file + else + lib_path="$lib_path;$file" + fi + linker_opts="$linker_opts -LIBPATH:$file" + ;; + -static) + shared=false + ;; + -Wl,*) + arg=${1#-Wl,} + save_ifs="$IFS"; IFS=',' + for flag in $arg; do + IFS="$save_ifs" + linker_opts="$linker_opts $flag" + done + IFS="$save_ifs" + ;; + -Xlinker) + eat=1 + linker_opts="$linker_opts $2" + ;; + -*) + set x "$@" "$1" + shift + ;; + *.cc | *.CC | *.cxx | *.CXX | *.[cC]++) + func_file_conv "$1" + set x "$@" -Tp"$file" + shift + ;; + *.c | *.cpp | *.CPP | *.lib | *.LIB | *.Lib | *.OBJ | *.obj | *.[oO]) + func_file_conv "$1" mingw + set x "$@" "$file" + shift + ;; + *) + set x "$@" "$1" + shift + ;; + esac + fi + shift + done + if test -n "$linker_opts"; then + linker_opts="-link$linker_opts" + fi + exec "$@" $linker_opts + exit 1 +} + +eat= + +case $1 in + '') + echo "$0: No command. Try '$0 --help' for more information." 1>&2 + exit 1; + ;; + -h | --h*) + cat <<\EOF +Usage: compile [--help] [--version] PROGRAM [ARGS] + +Wrapper for compilers which do not understand '-c -o'. +Remove '-o dest.o' from ARGS, run PROGRAM with the remaining +arguments, and rename the output as expected. + +If you are trying to build a whole package this is not the +right script to run: please start by reading the file 'INSTALL'. + +Report bugs to . +EOF + exit $? + ;; + -v | --v*) + echo "compile $scriptversion" + exit $? + ;; + cl | *[/\\]cl | cl.exe | *[/\\]cl.exe ) + func_cl_wrapper "$@" # Doesn't return... + ;; +esac + +ofile= +cfile= + +for arg +do + if test -n "$eat"; then + eat= + else + case $1 in + -o) + # configure might choose to run compile as 'compile cc -o foo foo.c'. + # So we strip '-o arg' only if arg is an object. 
+ eat=1 + case $2 in + *.o | *.obj) + ofile=$2 + ;; + *) + set x "$@" -o "$2" + shift + ;; + esac + ;; + *.c) + cfile=$1 + set x "$@" "$1" + shift + ;; + *) + set x "$@" "$1" + shift + ;; + esac + fi + shift +done + +if test -z "$ofile" || test -z "$cfile"; then + # If no '-o' option was seen then we might have been invoked from a + # pattern rule where we don't need one. That is ok -- this is a + # normal compilation that the losing compiler can handle. If no + # '.c' file was seen then we are probably linking. That is also + # ok. + exec "$@" +fi + +# Name of file we expect compiler to create. +cofile=`echo "$cfile" | sed 's|^.*[\\/]||; s|^[a-zA-Z]:||; s/\.c$/.o/'` + +# Create the lock directory. +# Note: use '[/\\:.-]' here to ensure that we don't use the same name +# that we are using for the .o file. Also, base the name on the expected +# object file name, since that is what matters with a parallel build. +lockdir=`echo "$cofile" | sed -e 's|[/\\:.-]|_|g'`.d +while true; do + if mkdir "$lockdir" >/dev/null 2>&1; then + break + fi + sleep 1 +done +# FIXME: race condition here if user kills between mkdir and trap. +trap "rmdir '$lockdir'; exit 1" 1 2 15 + +# Run the compile. +"$@" +ret=$? + +if test -f "$cofile"; then + test "$cofile" = "$ofile" || mv "$cofile" "$ofile" +elif test -f "${cofile}bj"; then + test "${cofile}bj" = "$ofile" || mv "${cofile}bj" "$ofile" +fi + +rmdir "$lockdir" +exit $ret + +# Local Variables: +# mode: shell-script +# sh-indentation: 2 +# eval: (add-hook 'write-file-hooks 'time-stamp) +# time-stamp-start: "scriptversion=" +# time-stamp-format: "%:y-%02m-%02d.%02H" +# time-stamp-time-zone: "UTC" +# time-stamp-end: "; # UTC" +# End: diff --git a/config.guess b/config.guess new file mode 100644 index 0000000..f32079a --- /dev/null +++ b/config.guess @@ -0,0 +1,1526 @@ +#! /bin/sh +# Attempt to guess a canonical system name. +# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, +# 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 +# Free Software Foundation, Inc. + +timestamp='2008-01-23' + +# This file is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA +# 02110-1301, USA. +# +# As a special exception to the GNU General Public License, if you +# distribute this file as part of a program that contains a +# configuration script generated by Autoconf, you may include it under +# the same distribution terms that you use for the rest of that program. + + +# Originally written by Per Bothner . +# Please send patches to . Submit a context +# diff and a properly formatted ChangeLog entry. +# +# This script attempts to guess a canonical system name similar to +# config.sub. If it succeeds, it prints the system name on stdout, and +# exits with 0. Otherwise, it exits with 1. +# +# The plan is that this can be called by configure scripts if you +# don't specify an explicit build system type. 
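For orientation, a hedged sketch of what running the script looks like; the printed triplet is only an example and depends entirely on the host machine:

  # Illustrative only -- the triplet varies by host.
  $ ./config.guess
  x86_64-unknown-linux-gnu     # CPU-VENDOR-OS, in the form configure and config.sub expect
  $ ./config.guess --time-stamp
  2008-01-23                   # the `timestamp' value set near the top of this script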
+ +me=`echo "$0" | sed -e 's,.*/,,'` + +usage="\ +Usage: $0 [OPTION] + +Output the configuration name of the system \`$me' is run on. + +Operation modes: + -h, --help print this help, then exit + -t, --time-stamp print date of last modification, then exit + -v, --version print version number, then exit + +Report bugs and patches to ." + +version="\ +GNU config.guess ($timestamp) + +Originally written by Per Bothner. +Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, +2002, 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc. + +This is free software; see the source for copying conditions. There is NO +warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." + +help=" +Try \`$me --help' for more information." + +# Parse command line +while test $# -gt 0 ; do + case $1 in + --time-stamp | --time* | -t ) + echo "$timestamp" ; exit ;; + --version | -v ) + echo "$version" ; exit ;; + --help | --h* | -h ) + echo "$usage"; exit ;; + -- ) # Stop option processing + shift; break ;; + - ) # Use stdin as input. + break ;; + -* ) + echo "$me: invalid option $1$help" >&2 + exit 1 ;; + * ) + break ;; + esac +done + +if test $# != 0; then + echo "$me: too many arguments$help" >&2 + exit 1 +fi + +trap 'exit 1' 1 2 15 + +# CC_FOR_BUILD -- compiler used by this script. Note that the use of a +# compiler to aid in system detection is discouraged as it requires +# temporary files to be created and, as you can see below, it is a +# headache to deal with in a portable fashion. + +# Historically, `CC_FOR_BUILD' used to be named `HOST_CC'. We still +# use `HOST_CC' if defined, but it is deprecated. + +# Portable tmp directory creation inspired by the Autoconf team. + +set_cc_for_build=' +trap "exitcode=\$?; (rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null) && exit \$exitcode" 0 ; +trap "rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null; exit 1" 1 2 13 15 ; +: ${TMPDIR=/tmp} ; + { tmp=`(umask 077 && mktemp -d "$TMPDIR/cgXXXXXX") 2>/dev/null` && test -n "$tmp" && test -d "$tmp" ; } || + { test -n "$RANDOM" && tmp=$TMPDIR/cg$$-$RANDOM && (umask 077 && mkdir $tmp) ; } || + { tmp=$TMPDIR/cg-$$ && (umask 077 && mkdir $tmp) && echo "Warning: creating insecure temp directory" >&2 ; } || + { echo "$me: cannot create a temporary directory in $TMPDIR" >&2 ; exit 1 ; } ; +dummy=$tmp/dummy ; +tmpfiles="$dummy.c $dummy.o $dummy.rel $dummy" ; +case $CC_FOR_BUILD,$HOST_CC,$CC in + ,,) echo "int x;" > $dummy.c ; + for c in cc gcc c89 c99 ; do + if ($c -c -o $dummy.o $dummy.c) >/dev/null 2>&1 ; then + CC_FOR_BUILD="$c"; break ; + fi ; + done ; + if test x"$CC_FOR_BUILD" = x ; then + CC_FOR_BUILD=no_compiler_found ; + fi + ;; + ,,*) CC_FOR_BUILD=$CC ;; + ,*,*) CC_FOR_BUILD=$HOST_CC ;; +esac ; set_cc_for_build= ;' + +# This is needed to find uname on a Pyramid OSx when run in the BSD universe. +# (ghazi@noc.rutgers.edu 1994-08-24) +if (test -f /.attbin/uname) >/dev/null 2>&1 ; then + PATH=$PATH:/.attbin ; export PATH +fi + +UNAME_MACHINE=`(uname -m) 2>/dev/null` || UNAME_MACHINE=unknown +UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown +UNAME_SYSTEM=`(uname -s) 2>/dev/null` || UNAME_SYSTEM=unknown +UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown + +# Note: order is significant - the case branches are not exclusive. 
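To make the dispatch below concrete, a minimal stand-alone sketch that mirrors the FreeBSD/amd64 branch of the big case statement; the sample uname values are hypothetical:

  # Hypothetical values, as uname might report them on a FreeBSD/amd64 box:
  UNAME_MACHINE=amd64; UNAME_SYSTEM=FreeBSD; UNAME_RELEASE=9.1-RELEASE; UNAME_VERSION=unknown
  case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
    *:FreeBSD:*:*)
      # mirrors the amd64 sub-case: remap the CPU name and strip "-..." from the release
      echo x86_64-unknown-freebsd`echo ${UNAME_RELEASE} | sed -e 's/[-(].*//'`
      ;;
  esac
  # prints: x86_64-unknown-freebsd9.1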
+ +case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in + *:NetBSD:*:*) + # NetBSD (nbsd) targets should (where applicable) match one or + # more of the tupples: *-*-netbsdelf*, *-*-netbsdaout*, + # *-*-netbsdecoff* and *-*-netbsd*. For targets that recently + # switched to ELF, *-*-netbsd* would select the old + # object file format. This provides both forward + # compatibility and a consistent mechanism for selecting the + # object file format. + # + # Note: NetBSD doesn't particularly care about the vendor + # portion of the name. We always set it to "unknown". + sysctl="sysctl -n hw.machine_arch" + UNAME_MACHINE_ARCH=`(/sbin/$sysctl 2>/dev/null || \ + /usr/sbin/$sysctl 2>/dev/null || echo unknown)` + case "${UNAME_MACHINE_ARCH}" in + armeb) machine=armeb-unknown ;; + arm*) machine=arm-unknown ;; + sh3el) machine=shl-unknown ;; + sh3eb) machine=sh-unknown ;; + sh5el) machine=sh5le-unknown ;; + *) machine=${UNAME_MACHINE_ARCH}-unknown ;; + esac + # The Operating System including object format, if it has switched + # to ELF recently, or will in the future. + case "${UNAME_MACHINE_ARCH}" in + arm*|i386|m68k|ns32k|sh3*|sparc|vax) + eval $set_cc_for_build + if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \ + | grep __ELF__ >/dev/null + then + # Once all utilities can be ECOFF (netbsdecoff) or a.out (netbsdaout). + # Return netbsd for either. FIX? + os=netbsd + else + os=netbsdelf + fi + ;; + *) + os=netbsd + ;; + esac + # The OS release + # Debian GNU/NetBSD machines have a different userland, and + # thus, need a distinct triplet. However, they do not need + # kernel version information, so it can be replaced with a + # suitable tag, in the style of linux-gnu. + case "${UNAME_VERSION}" in + Debian*) + release='-gnu' + ;; + *) + release=`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'` + ;; + esac + # Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM: + # contains redundant information, the shorter form: + # CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used. + echo "${machine}-${os}${release}" + exit ;; + *:OpenBSD:*:*) + UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'` + echo ${UNAME_MACHINE_ARCH}-unknown-openbsd${UNAME_RELEASE} + exit ;; + *:ekkoBSD:*:*) + echo ${UNAME_MACHINE}-unknown-ekkobsd${UNAME_RELEASE} + exit ;; + *:SolidBSD:*:*) + echo ${UNAME_MACHINE}-unknown-solidbsd${UNAME_RELEASE} + exit ;; + macppc:MirBSD:*:*) + echo powerpc-unknown-mirbsd${UNAME_RELEASE} + exit ;; + *:MirBSD:*:*) + echo ${UNAME_MACHINE}-unknown-mirbsd${UNAME_RELEASE} + exit ;; + alpha:OSF1:*:*) + case $UNAME_RELEASE in + *4.0) + UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $3}'` + ;; + *5.*) + UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $4}'` + ;; + esac + # According to Compaq, /usr/sbin/psrinfo has been available on + # OSF/1 and Tru64 systems produced since 1995. I hope that + # covers most systems running today. This code pipes the CPU + # types through head -n 1, so we only detect the type of CPU 0. 
+ ALPHA_CPU_TYPE=`/usr/sbin/psrinfo -v | sed -n -e 's/^ The alpha \(.*\) processor.*$/\1/p' | head -n 1` + case "$ALPHA_CPU_TYPE" in + "EV4 (21064)") + UNAME_MACHINE="alpha" ;; + "EV4.5 (21064)") + UNAME_MACHINE="alpha" ;; + "LCA4 (21066/21068)") + UNAME_MACHINE="alpha" ;; + "EV5 (21164)") + UNAME_MACHINE="alphaev5" ;; + "EV5.6 (21164A)") + UNAME_MACHINE="alphaev56" ;; + "EV5.6 (21164PC)") + UNAME_MACHINE="alphapca56" ;; + "EV5.7 (21164PC)") + UNAME_MACHINE="alphapca57" ;; + "EV6 (21264)") + UNAME_MACHINE="alphaev6" ;; + "EV6.7 (21264A)") + UNAME_MACHINE="alphaev67" ;; + "EV6.8CB (21264C)") + UNAME_MACHINE="alphaev68" ;; + "EV6.8AL (21264B)") + UNAME_MACHINE="alphaev68" ;; + "EV6.8CX (21264D)") + UNAME_MACHINE="alphaev68" ;; + "EV6.9A (21264/EV69A)") + UNAME_MACHINE="alphaev69" ;; + "EV7 (21364)") + UNAME_MACHINE="alphaev7" ;; + "EV7.9 (21364A)") + UNAME_MACHINE="alphaev79" ;; + esac + # A Pn.n version is a patched version. + # A Vn.n version is a released version. + # A Tn.n version is a released field test version. + # A Xn.n version is an unreleased experimental baselevel. + # 1.2 uses "1.2" for uname -r. + echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[PVTX]//' | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'` + exit ;; + Alpha\ *:Windows_NT*:*) + # How do we know it's Interix rather than the generic POSIX subsystem? + # Should we change UNAME_MACHINE based on the output of uname instead + # of the specific Alpha model? + echo alpha-pc-interix + exit ;; + 21064:Windows_NT:50:3) + echo alpha-dec-winnt3.5 + exit ;; + Amiga*:UNIX_System_V:4.0:*) + echo m68k-unknown-sysv4 + exit ;; + *:[Aa]miga[Oo][Ss]:*:*) + echo ${UNAME_MACHINE}-unknown-amigaos + exit ;; + *:[Mm]orph[Oo][Ss]:*:*) + echo ${UNAME_MACHINE}-unknown-morphos + exit ;; + *:OS/390:*:*) + echo i370-ibm-openedition + exit ;; + *:z/VM:*:*) + echo s390-ibm-zvmoe + exit ;; + *:OS400:*:*) + echo powerpc-ibm-os400 + exit ;; + arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*) + echo arm-acorn-riscix${UNAME_RELEASE} + exit ;; + arm:riscos:*:*|arm:RISCOS:*:*) + echo arm-unknown-riscos + exit ;; + SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*) + echo hppa1.1-hitachi-hiuxmpp + exit ;; + Pyramid*:OSx*:*:* | MIS*:OSx*:*:* | MIS*:SMP_DC-OSx*:*:*) + # akee@wpdis03.wpafb.af.mil (Earle F. Ake) contributed MIS and NILE. + if test "`(/bin/universe) 2>/dev/null`" = att ; then + echo pyramid-pyramid-sysv3 + else + echo pyramid-pyramid-bsd + fi + exit ;; + NILE*:*:*:dcosx) + echo pyramid-pyramid-svr4 + exit ;; + DRS?6000:unix:4.0:6*) + echo sparc-icl-nx6 + exit ;; + DRS?6000:UNIX_SV:4.2*:7* | DRS?6000:isis:4.2*:7*) + case `/usr/bin/uname -p` in + sparc) echo sparc-icl-nx7; exit ;; + esac ;; + sun4H:SunOS:5.*:*) + echo sparc-hal-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit ;; + sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*) + echo sparc-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit ;; + i86pc:SunOS:5.*:* | i86xen:SunOS:5.*:*) + echo i386-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit ;; + sun4*:SunOS:6*:*) + # According to config.sub, this is the proper way to canonicalize + # SunOS6. Hard to guess exactly what SunOS6 will be like, but + # it's likely to be more like Solaris than SunOS4. + echo sparc-sun-solaris3`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit ;; + sun4*:SunOS:*:*) + case "`/usr/bin/arch -k`" in + Series*|S4*) + UNAME_RELEASE=`uname -v` + ;; + esac + # Japanese Language versions have a version number like `4.1.3-JL'. 
+ echo sparc-sun-sunos`echo ${UNAME_RELEASE}|sed -e 's/-/_/'` + exit ;; + sun3*:SunOS:*:*) + echo m68k-sun-sunos${UNAME_RELEASE} + exit ;; + sun*:*:4.2BSD:*) + UNAME_RELEASE=`(sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null` + test "x${UNAME_RELEASE}" = "x" && UNAME_RELEASE=3 + case "`/bin/arch`" in + sun3) + echo m68k-sun-sunos${UNAME_RELEASE} + ;; + sun4) + echo sparc-sun-sunos${UNAME_RELEASE} + ;; + esac + exit ;; + aushp:SunOS:*:*) + echo sparc-auspex-sunos${UNAME_RELEASE} + exit ;; + # The situation for MiNT is a little confusing. The machine name + # can be virtually everything (everything which is not + # "atarist" or "atariste" at least should have a processor + # > m68000). The system name ranges from "MiNT" over "FreeMiNT" + # to the lowercase version "mint" (or "freemint"). Finally + # the system name "TOS" denotes a system which is actually not + # MiNT. But MiNT is downward compatible to TOS, so this should + # be no problem. + atarist[e]:*MiNT:*:* | atarist[e]:*mint:*:* | atarist[e]:*TOS:*:*) + echo m68k-atari-mint${UNAME_RELEASE} + exit ;; + atari*:*MiNT:*:* | atari*:*mint:*:* | atarist[e]:*TOS:*:*) + echo m68k-atari-mint${UNAME_RELEASE} + exit ;; + *falcon*:*MiNT:*:* | *falcon*:*mint:*:* | *falcon*:*TOS:*:*) + echo m68k-atari-mint${UNAME_RELEASE} + exit ;; + milan*:*MiNT:*:* | milan*:*mint:*:* | *milan*:*TOS:*:*) + echo m68k-milan-mint${UNAME_RELEASE} + exit ;; + hades*:*MiNT:*:* | hades*:*mint:*:* | *hades*:*TOS:*:*) + echo m68k-hades-mint${UNAME_RELEASE} + exit ;; + *:*MiNT:*:* | *:*mint:*:* | *:*TOS:*:*) + echo m68k-unknown-mint${UNAME_RELEASE} + exit ;; + m68k:machten:*:*) + echo m68k-apple-machten${UNAME_RELEASE} + exit ;; + powerpc:machten:*:*) + echo powerpc-apple-machten${UNAME_RELEASE} + exit ;; + RISC*:Mach:*:*) + echo mips-dec-mach_bsd4.3 + exit ;; + RISC*:ULTRIX:*:*) + echo mips-dec-ultrix${UNAME_RELEASE} + exit ;; + VAX*:ULTRIX*:*:*) + echo vax-dec-ultrix${UNAME_RELEASE} + exit ;; + 2020:CLIX:*:* | 2430:CLIX:*:*) + echo clipper-intergraph-clix${UNAME_RELEASE} + exit ;; + mips:*:*:UMIPS | mips:*:*:RISCos) + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c +#ifdef __cplusplus +#include /* for printf() prototype */ + int main (int argc, char *argv[]) { +#else + int main (argc, argv) int argc; char *argv[]; { +#endif + #if defined (host_mips) && defined (MIPSEB) + #if defined (SYSTYPE_SYSV) + printf ("mips-mips-riscos%ssysv\n", argv[1]); exit (0); + #endif + #if defined (SYSTYPE_SVR4) + printf ("mips-mips-riscos%ssvr4\n", argv[1]); exit (0); + #endif + #if defined (SYSTYPE_BSD43) || defined(SYSTYPE_BSD) + printf ("mips-mips-riscos%sbsd\n", argv[1]); exit (0); + #endif + #endif + exit (-1); + } +EOF + $CC_FOR_BUILD -o $dummy $dummy.c && + dummyarg=`echo "${UNAME_RELEASE}" | sed -n 's/\([0-9]*\).*/\1/p'` && + SYSTEM_NAME=`$dummy $dummyarg` && + { echo "$SYSTEM_NAME"; exit; } + echo mips-mips-riscos${UNAME_RELEASE} + exit ;; + Motorola:PowerMAX_OS:*:*) + echo powerpc-motorola-powermax + exit ;; + Motorola:*:4.3:PL8-*) + echo powerpc-harris-powermax + exit ;; + Night_Hawk:*:*:PowerMAX_OS | Synergy:PowerMAX_OS:*:*) + echo powerpc-harris-powermax + exit ;; + Night_Hawk:Power_UNIX:*:*) + echo powerpc-harris-powerunix + exit ;; + m88k:CX/UX:7*:*) + echo m88k-harris-cxux7 + exit ;; + m88k:*:4*:R4*) + echo m88k-motorola-sysv4 + exit ;; + m88k:*:3*:R3*) + echo m88k-motorola-sysv3 + exit ;; + AViiON:dgux:*:*) + # DG/UX returns AViiON for all architectures + UNAME_PROCESSOR=`/usr/bin/uname -p` + if [ $UNAME_PROCESSOR = mc88100 ] || [ $UNAME_PROCESSOR = 
mc88110 ] + then + if [ ${TARGET_BINARY_INTERFACE}x = m88kdguxelfx ] || \ + [ ${TARGET_BINARY_INTERFACE}x = x ] + then + echo m88k-dg-dgux${UNAME_RELEASE} + else + echo m88k-dg-dguxbcs${UNAME_RELEASE} + fi + else + echo i586-dg-dgux${UNAME_RELEASE} + fi + exit ;; + M88*:DolphinOS:*:*) # DolphinOS (SVR3) + echo m88k-dolphin-sysv3 + exit ;; + M88*:*:R3*:*) + # Delta 88k system running SVR3 + echo m88k-motorola-sysv3 + exit ;; + XD88*:*:*:*) # Tektronix XD88 system running UTekV (SVR3) + echo m88k-tektronix-sysv3 + exit ;; + Tek43[0-9][0-9]:UTek:*:*) # Tektronix 4300 system running UTek (BSD) + echo m68k-tektronix-bsd + exit ;; + *:IRIX*:*:*) + echo mips-sgi-irix`echo ${UNAME_RELEASE}|sed -e 's/-/_/g'` + exit ;; + ????????:AIX?:[12].1:2) # AIX 2.2.1 or AIX 2.1.1 is RT/PC AIX. + echo romp-ibm-aix # uname -m gives an 8 hex-code CPU id + exit ;; # Note that: echo "'`uname -s`'" gives 'AIX ' + i*86:AIX:*:*) + echo i386-ibm-aix + exit ;; + ia64:AIX:*:*) + if [ -x /usr/bin/oslevel ] ; then + IBM_REV=`/usr/bin/oslevel` + else + IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE} + fi + echo ${UNAME_MACHINE}-ibm-aix${IBM_REV} + exit ;; + *:AIX:2:3) + if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + #include + + main() + { + if (!__power_pc()) + exit(1); + puts("powerpc-ibm-aix3.2.5"); + exit(0); + } +EOF + if $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` + then + echo "$SYSTEM_NAME" + else + echo rs6000-ibm-aix3.2.5 + fi + elif grep bos324 /usr/include/stdio.h >/dev/null 2>&1; then + echo rs6000-ibm-aix3.2.4 + else + echo rs6000-ibm-aix3.2 + fi + exit ;; + *:AIX:*:[456]) + IBM_CPU_ID=`/usr/sbin/lsdev -C -c processor -S available | sed 1q | awk '{ print $1 }'` + if /usr/sbin/lsattr -El ${IBM_CPU_ID} | grep ' POWER' >/dev/null 2>&1; then + IBM_ARCH=rs6000 + else + IBM_ARCH=powerpc + fi + if [ -x /usr/bin/oslevel ] ; then + IBM_REV=`/usr/bin/oslevel` + else + IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE} + fi + echo ${IBM_ARCH}-ibm-aix${IBM_REV} + exit ;; + *:AIX:*:*) + echo rs6000-ibm-aix + exit ;; + ibmrt:4.4BSD:*|romp-ibm:BSD:*) + echo romp-ibm-bsd4.4 + exit ;; + ibmrt:*BSD:*|romp-ibm:BSD:*) # covers RT/PC BSD and + echo romp-ibm-bsd${UNAME_RELEASE} # 4.3 with uname added to + exit ;; # report: romp-ibm BSD 4.3 + *:BOSX:*:*) + echo rs6000-bull-bosx + exit ;; + DPX/2?00:B.O.S.:*:*) + echo m68k-bull-sysv3 + exit ;; + 9000/[34]??:4.3bsd:1.*:*) + echo m68k-hp-bsd + exit ;; + hp300:4.4BSD:*:* | 9000/[34]??:4.3bsd:2.*:*) + echo m68k-hp-bsd4.4 + exit ;; + 9000/[34678]??:HP-UX:*:*) + HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'` + case "${UNAME_MACHINE}" in + 9000/31? ) HP_ARCH=m68000 ;; + 9000/[34]?? 
) HP_ARCH=m68k ;; + 9000/[678][0-9][0-9]) + if [ -x /usr/bin/getconf ]; then + sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null` + sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null` + case "${sc_cpu_version}" in + 523) HP_ARCH="hppa1.0" ;; # CPU_PA_RISC1_0 + 528) HP_ARCH="hppa1.1" ;; # CPU_PA_RISC1_1 + 532) # CPU_PA_RISC2_0 + case "${sc_kernel_bits}" in + 32) HP_ARCH="hppa2.0n" ;; + 64) HP_ARCH="hppa2.0w" ;; + '') HP_ARCH="hppa2.0" ;; # HP-UX 10.20 + esac ;; + esac + fi + if [ "${HP_ARCH}" = "" ]; then + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + + #define _HPUX_SOURCE + #include + #include + + int main () + { + #if defined(_SC_KERNEL_BITS) + long bits = sysconf(_SC_KERNEL_BITS); + #endif + long cpu = sysconf (_SC_CPU_VERSION); + + switch (cpu) + { + case CPU_PA_RISC1_0: puts ("hppa1.0"); break; + case CPU_PA_RISC1_1: puts ("hppa1.1"); break; + case CPU_PA_RISC2_0: + #if defined(_SC_KERNEL_BITS) + switch (bits) + { + case 64: puts ("hppa2.0w"); break; + case 32: puts ("hppa2.0n"); break; + default: puts ("hppa2.0"); break; + } break; + #else /* !defined(_SC_KERNEL_BITS) */ + puts ("hppa2.0"); break; + #endif + default: puts ("hppa1.0"); break; + } + exit (0); + } +EOF + (CCOPTS= $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy` + test -z "$HP_ARCH" && HP_ARCH=hppa + fi ;; + esac + if [ ${HP_ARCH} = "hppa2.0w" ] + then + eval $set_cc_for_build + + # hppa2.0w-hp-hpux* has a 64-bit kernel and a compiler generating + # 32-bit code. hppa64-hp-hpux* has the same kernel and a compiler + # generating 64-bit code. GNU and HP use different nomenclature: + # + # $ CC_FOR_BUILD=cc ./config.guess + # => hppa2.0w-hp-hpux11.23 + # $ CC_FOR_BUILD="cc +DA2.0w" ./config.guess + # => hppa64-hp-hpux11.23 + + if echo __LP64__ | (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | + grep __LP64__ >/dev/null + then + HP_ARCH="hppa2.0w" + else + HP_ARCH="hppa64" + fi + fi + echo ${HP_ARCH}-hp-hpux${HPUX_REV} + exit ;; + ia64:HP-UX:*:*) + HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'` + echo ia64-hp-hpux${HPUX_REV} + exit ;; + 3050*:HI-UX:*:*) + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + #include + int + main () + { + long cpu = sysconf (_SC_CPU_VERSION); + /* The order matters, because CPU_IS_HP_MC68K erroneously returns + true for CPU_PA_RISC1_0. CPU_IS_PA_RISC returns correct + results, however. 
*/ + if (CPU_IS_PA_RISC (cpu)) + { + switch (cpu) + { + case CPU_PA_RISC1_0: puts ("hppa1.0-hitachi-hiuxwe2"); break; + case CPU_PA_RISC1_1: puts ("hppa1.1-hitachi-hiuxwe2"); break; + case CPU_PA_RISC2_0: puts ("hppa2.0-hitachi-hiuxwe2"); break; + default: puts ("hppa-hitachi-hiuxwe2"); break; + } + } + else if (CPU_IS_HP_MC68K (cpu)) + puts ("m68k-hitachi-hiuxwe2"); + else puts ("unknown-hitachi-hiuxwe2"); + exit (0); + } +EOF + $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` && + { echo "$SYSTEM_NAME"; exit; } + echo unknown-hitachi-hiuxwe2 + exit ;; + 9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:* ) + echo hppa1.1-hp-bsd + exit ;; + 9000/8??:4.3bsd:*:*) + echo hppa1.0-hp-bsd + exit ;; + *9??*:MPE/iX:*:* | *3000*:MPE/iX:*:*) + echo hppa1.0-hp-mpeix + exit ;; + hp7??:OSF1:*:* | hp8?[79]:OSF1:*:* ) + echo hppa1.1-hp-osf + exit ;; + hp8??:OSF1:*:*) + echo hppa1.0-hp-osf + exit ;; + i*86:OSF1:*:*) + if [ -x /usr/sbin/sysversion ] ; then + echo ${UNAME_MACHINE}-unknown-osf1mk + else + echo ${UNAME_MACHINE}-unknown-osf1 + fi + exit ;; + parisc*:Lites*:*:*) + echo hppa1.1-hp-lites + exit ;; + C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*) + echo c1-convex-bsd + exit ;; + C2*:ConvexOS:*:* | convex:ConvexOS:C2*:*) + if getsysinfo -f scalar_acc + then echo c32-convex-bsd + else echo c2-convex-bsd + fi + exit ;; + C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*) + echo c34-convex-bsd + exit ;; + C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*) + echo c38-convex-bsd + exit ;; + C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*) + echo c4-convex-bsd + exit ;; + CRAY*Y-MP:*:*:*) + echo ymp-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + exit ;; + CRAY*[A-Z]90:*:*:*) + echo ${UNAME_MACHINE}-cray-unicos${UNAME_RELEASE} \ + | sed -e 's/CRAY.*\([A-Z]90\)/\1/' \ + -e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/ \ + -e 's/\.[^.]*$/.X/' + exit ;; + CRAY*TS:*:*:*) + echo t90-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + exit ;; + CRAY*T3E:*:*:*) + echo alphaev5-cray-unicosmk${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + exit ;; + CRAY*SV1:*:*:*) + echo sv1-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + exit ;; + *:UNICOS/mp:*:*) + echo craynv-cray-unicosmp${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + exit ;; + F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*) + FUJITSU_PROC=`uname -m | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'` + FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'` + FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'` + echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" + exit ;; + 5000:UNIX_System_V:4.*:*) + FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'` + FUJITSU_REL=`echo ${UNAME_RELEASE} | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/ /_/'` + echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" + exit ;; + i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*) + echo ${UNAME_MACHINE}-pc-bsdi${UNAME_RELEASE} + exit ;; + sparc*:BSD/OS:*:*) + echo sparc-unknown-bsdi${UNAME_RELEASE} + exit ;; + *:BSD/OS:*:*) + echo ${UNAME_MACHINE}-unknown-bsdi${UNAME_RELEASE} + exit ;; + *:FreeBSD:*:*) + case ${UNAME_MACHINE} in + pc98) + echo i386-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; + amd64) + echo x86_64-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; + *) + echo ${UNAME_MACHINE}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; + esac + exit ;; + i*:CYGWIN*:*) + 
echo ${UNAME_MACHINE}-pc-cygwin + exit ;; + *:MINGW*:*) + echo ${UNAME_MACHINE}-pc-mingw32 + exit ;; + i*:windows32*:*) + # uname -m includes "-pc" on this system. + echo ${UNAME_MACHINE}-mingw32 + exit ;; + i*:PW*:*) + echo ${UNAME_MACHINE}-pc-pw32 + exit ;; + *:Interix*:[3456]*) + case ${UNAME_MACHINE} in + x86) + echo i586-pc-interix${UNAME_RELEASE} + exit ;; + EM64T | authenticamd) + echo x86_64-unknown-interix${UNAME_RELEASE} + exit ;; + IA64) + echo ia64-unknown-interix${UNAME_RELEASE} + exit ;; + esac ;; + [345]86:Windows_95:* | [345]86:Windows_98:* | [345]86:Windows_NT:*) + echo i${UNAME_MACHINE}-pc-mks + exit ;; + i*:Windows_NT*:* | Pentium*:Windows_NT*:*) + # How do we know it's Interix rather than the generic POSIX subsystem? + # It also conflicts with pre-2.0 versions of AT&T UWIN. Should we + # UNAME_MACHINE based on the output of uname instead of i386? + echo i586-pc-interix + exit ;; + i*:UWIN*:*) + echo ${UNAME_MACHINE}-pc-uwin + exit ;; + amd64:CYGWIN*:*:* | x86_64:CYGWIN*:*:*) + echo x86_64-unknown-cygwin + exit ;; + p*:CYGWIN*:*) + echo powerpcle-unknown-cygwin + exit ;; + prep*:SunOS:5.*:*) + echo powerpcle-unknown-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit ;; + *:GNU:*:*) + # the GNU system + echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-gnu`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'` + exit ;; + *:GNU/*:*:*) + # other systems with GNU libc and userland + echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-gnu + exit ;; + i*86:Minix:*:*) + echo ${UNAME_MACHINE}-pc-minix + exit ;; + arm*:Linux:*:*) + eval $set_cc_for_build + if echo __ARM_EABI__ | $CC_FOR_BUILD -E - 2>/dev/null \ + | grep -q __ARM_EABI__ + then + echo ${UNAME_MACHINE}-unknown-linux-gnu + else + echo ${UNAME_MACHINE}-unknown-linux-gnueabi + fi + exit ;; + avr32*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; + cris:Linux:*:*) + echo cris-axis-linux-gnu + exit ;; + crisv32:Linux:*:*) + echo crisv32-axis-linux-gnu + exit ;; + frv:Linux:*:*) + echo frv-unknown-linux-gnu + exit ;; + ia64:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; + m32r*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; + m68*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; + mips:Linux:*:*) + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + #undef CPU + #undef mips + #undef mipsel + #if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL) + CPU=mipsel + #else + #if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB) + CPU=mips + #else + CPU= + #endif + #endif +EOF + eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n ' + /^CPU/{ + s: ::g + p + }'`" + test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; } + ;; + mips64:Linux:*:*) + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + #undef CPU + #undef mips64 + #undef mips64el + #if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL) + CPU=mips64el + #else + #if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB) + CPU=mips64 + #else + CPU= + #endif + #endif +EOF + eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n ' + /^CPU/{ + s: ::g + p + }'`" + test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; } + ;; + or32:Linux:*:*) + echo or32-unknown-linux-gnu + exit ;; + ppc:Linux:*:*) + echo powerpc-unknown-linux-gnu + exit ;; + ppc64:Linux:*:*) + echo 
powerpc64-unknown-linux-gnu + exit ;; + alpha:Linux:*:*) + case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in + EV5) UNAME_MACHINE=alphaev5 ;; + EV56) UNAME_MACHINE=alphaev56 ;; + PCA56) UNAME_MACHINE=alphapca56 ;; + PCA57) UNAME_MACHINE=alphapca56 ;; + EV6) UNAME_MACHINE=alphaev6 ;; + EV67) UNAME_MACHINE=alphaev67 ;; + EV68*) UNAME_MACHINE=alphaev68 ;; + esac + objdump --private-headers /bin/sh | grep ld.so.1 >/dev/null + if test "$?" = 0 ; then LIBC="libc1" ; else LIBC="" ; fi + echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC} + exit ;; + parisc:Linux:*:* | hppa:Linux:*:*) + # Look for CPU level + case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in + PA7*) echo hppa1.1-unknown-linux-gnu ;; + PA8*) echo hppa2.0-unknown-linux-gnu ;; + *) echo hppa-unknown-linux-gnu ;; + esac + exit ;; + parisc64:Linux:*:* | hppa64:Linux:*:*) + echo hppa64-unknown-linux-gnu + exit ;; + s390:Linux:*:* | s390x:Linux:*:*) + echo ${UNAME_MACHINE}-ibm-linux + exit ;; + sh64*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; + sh*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; + sparc:Linux:*:* | sparc64:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; + vax:Linux:*:*) + echo ${UNAME_MACHINE}-dec-linux-gnu + exit ;; + x86_64:Linux:*:*) + echo x86_64-unknown-linux-gnu + exit ;; + xtensa*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; + i*86:Linux:*:*) + # The BFD linker knows what the default object file format is, so + # first see if it will tell us. cd to the root directory to prevent + # problems with other programs or directories called `ld' in the path. + # Set LC_ALL=C to ensure ld outputs messages in English. + ld_supported_targets=`cd /; LC_ALL=C ld --help 2>&1 \ + | sed -ne '/supported targets:/!d + s/[ ][ ]*/ /g + s/.*supported targets: *// + s/ .*// + p'` + case "$ld_supported_targets" in + elf32-i386) + TENTATIVE="${UNAME_MACHINE}-pc-linux-gnu" + ;; + a.out-i386-linux) + echo "${UNAME_MACHINE}-pc-linux-gnuaout" + exit ;; + coff-i386) + echo "${UNAME_MACHINE}-pc-linux-gnucoff" + exit ;; + "") + # Either a pre-BFD a.out linker (linux-gnuoldld) or + # one that does not give us useful --help. + echo "${UNAME_MACHINE}-pc-linux-gnuoldld" + exit ;; + esac + # Determine whether the default compiler is a.out or elf + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + #include + #ifdef __ELF__ + # ifdef __GLIBC__ + # if __GLIBC__ >= 2 + LIBC=gnu + # else + LIBC=gnulibc1 + # endif + # else + LIBC=gnulibc1 + # endif + #else + #if defined(__INTEL_COMPILER) || defined(__PGI) || defined(__SUNPRO_C) || defined(__SUNPRO_CC) + LIBC=gnu + #else + LIBC=gnuaout + #endif + #endif + #ifdef __dietlibc__ + LIBC=dietlibc + #endif +EOF + eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n ' + /^LIBC/{ + s: ::g + p + }'`" + test x"${LIBC}" != x && { + echo "${UNAME_MACHINE}-pc-linux-${LIBC}" + exit + } + test x"${TENTATIVE}" != x && { echo "${TENTATIVE}"; exit; } + ;; + i*86:DYNIX/ptx:4*:*) + # ptx 4.0 does uname -s correctly, with DYNIX/ptx in there. + # earlier versions are messed up and put the nodename in both + # sysname and nodename. + echo i386-sequent-sysv4 + exit ;; + i*86:UNIX_SV:4.2MP:2.*) + # Unixware is an offshoot of SVR4, but it has its own version + # number series starting with 2... + # I am not positive that other SVR4 systems won't match this, + # I just have to hope. -- rms. + # Use sysv4.2uw... so that sysv4* matches it. 
+ echo ${UNAME_MACHINE}-pc-sysv4.2uw${UNAME_VERSION} + exit ;; + i*86:OS/2:*:*) + # If we were able to find `uname', then EMX Unix compatibility + # is probably installed. + echo ${UNAME_MACHINE}-pc-os2-emx + exit ;; + i*86:XTS-300:*:STOP) + echo ${UNAME_MACHINE}-unknown-stop + exit ;; + i*86:atheos:*:*) + echo ${UNAME_MACHINE}-unknown-atheos + exit ;; + i*86:syllable:*:*) + echo ${UNAME_MACHINE}-pc-syllable + exit ;; + i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.0*:*) + echo i386-unknown-lynxos${UNAME_RELEASE} + exit ;; + i*86:*DOS:*:*) + echo ${UNAME_MACHINE}-pc-msdosdjgpp + exit ;; + i*86:*:4.*:* | i*86:SYSTEM_V:4.*:*) + UNAME_REL=`echo ${UNAME_RELEASE} | sed 's/\/MP$//'` + if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then + echo ${UNAME_MACHINE}-univel-sysv${UNAME_REL} + else + echo ${UNAME_MACHINE}-pc-sysv${UNAME_REL} + fi + exit ;; + i*86:*:5:[678]*) + # UnixWare 7.x, OpenUNIX and OpenServer 6. + case `/bin/uname -X | grep "^Machine"` in + *486*) UNAME_MACHINE=i486 ;; + *Pentium) UNAME_MACHINE=i586 ;; + *Pent*|*Celeron) UNAME_MACHINE=i686 ;; + esac + echo ${UNAME_MACHINE}-unknown-sysv${UNAME_RELEASE}${UNAME_SYSTEM}${UNAME_VERSION} + exit ;; + i*86:*:3.2:*) + if test -f /usr/options/cb.name; then + UNAME_REL=`sed -n 's/.*Version //p' /dev/null >/dev/null ; then + UNAME_REL=`(/bin/uname -X|grep Release|sed -e 's/.*= //')` + (/bin/uname -X|grep i80486 >/dev/null) && UNAME_MACHINE=i486 + (/bin/uname -X|grep '^Machine.*Pentium' >/dev/null) \ + && UNAME_MACHINE=i586 + (/bin/uname -X|grep '^Machine.*Pent *II' >/dev/null) \ + && UNAME_MACHINE=i686 + (/bin/uname -X|grep '^Machine.*Pentium Pro' >/dev/null) \ + && UNAME_MACHINE=i686 + echo ${UNAME_MACHINE}-pc-sco$UNAME_REL + else + echo ${UNAME_MACHINE}-pc-sysv32 + fi + exit ;; + pc:*:*:*) + # Left here for compatibility: + # uname -m prints for DJGPP always 'pc', but it prints nothing about + # the processor, so we play safe by assuming i386. + echo i386-pc-msdosdjgpp + exit ;; + Intel:Mach:3*:*) + echo i386-pc-mach3 + exit ;; + paragon:*:*:*) + echo i860-intel-osf1 + exit ;; + i860:*:4.*:*) # i860-SVR4 + if grep Stardent /usr/include/sys/uadmin.h >/dev/null 2>&1 ; then + echo i860-stardent-sysv${UNAME_RELEASE} # Stardent Vistra i860-SVR4 + else # Add other i860-SVR4 vendors below as they are discovered. 
+ echo i860-unknown-sysv${UNAME_RELEASE} # Unknown i860-SVR4 + fi + exit ;; + mini*:CTIX:SYS*5:*) + # "miniframe" + echo m68010-convergent-sysv + exit ;; + mc68k:UNIX:SYSTEM5:3.51m) + echo m68k-convergent-sysv + exit ;; + M680?0:D-NIX:5.3:*) + echo m68k-diab-dnix + exit ;; + M68*:*:R3V[5678]*:*) + test -r /sysV68 && { echo 'm68k-motorola-sysv'; exit; } ;; + 3[345]??:*:4.0:3.0 | 3[34]??A:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 3[34]??/*:*:4.0:3.0 | 4400:*:4.0:3.0 | 4850:*:4.0:3.0 | SKA40:*:4.0:3.0 | SDS2:*:4.0:3.0 | SHG2:*:4.0:3.0 | S7501*:*:4.0:3.0) + OS_REL='' + test -r /etc/.relid \ + && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid` + /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ + && { echo i486-ncr-sysv4.3${OS_REL}; exit; } + /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \ + && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;; + 3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*) + /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ + && { echo i486-ncr-sysv4; exit; } ;; + m68*:LynxOS:2.*:* | m68*:LynxOS:3.0*:*) + echo m68k-unknown-lynxos${UNAME_RELEASE} + exit ;; + mc68030:UNIX_System_V:4.*:*) + echo m68k-atari-sysv4 + exit ;; + TSUNAMI:LynxOS:2.*:*) + echo sparc-unknown-lynxos${UNAME_RELEASE} + exit ;; + rs6000:LynxOS:2.*:*) + echo rs6000-unknown-lynxos${UNAME_RELEASE} + exit ;; + PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.0*:*) + echo powerpc-unknown-lynxos${UNAME_RELEASE} + exit ;; + SM[BE]S:UNIX_SV:*:*) + echo mips-dde-sysv${UNAME_RELEASE} + exit ;; + RM*:ReliantUNIX-*:*:*) + echo mips-sni-sysv4 + exit ;; + RM*:SINIX-*:*:*) + echo mips-sni-sysv4 + exit ;; + *:SINIX-*:*:*) + if uname -p 2>/dev/null >/dev/null ; then + UNAME_MACHINE=`(uname -p) 2>/dev/null` + echo ${UNAME_MACHINE}-sni-sysv4 + else + echo ns32k-sni-sysv + fi + exit ;; + PENTIUM:*:4.0*:*) # Unisys `ClearPath HMP IX 4000' SVR4/MP effort + # says + echo i586-unisys-sysv4 + exit ;; + *:UNIX_System_V:4*:FTX*) + # From Gerald Hewes . + # How about differentiating between stratus architectures? -djm + echo hppa1.1-stratus-sysv4 + exit ;; + *:*:*:FTX*) + # From seanf@swdc.stratus.com. + echo i860-stratus-sysv4 + exit ;; + i*86:VOS:*:*) + # From Paul.Green@stratus.com. + echo ${UNAME_MACHINE}-stratus-vos + exit ;; + *:VOS:*:*) + # From Paul.Green@stratus.com. + echo hppa1.1-stratus-vos + exit ;; + mc68*:A/UX:*:*) + echo m68k-apple-aux${UNAME_RELEASE} + exit ;; + news*:NEWS-OS:6*:*) + echo mips-sony-newsos6 + exit ;; + R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*) + if [ -d /usr/nec ]; then + echo mips-nec-sysv${UNAME_RELEASE} + else + echo mips-unknown-sysv${UNAME_RELEASE} + fi + exit ;; + BeBox:BeOS:*:*) # BeOS running on hardware made by Be, PPC only. + echo powerpc-be-beos + exit ;; + BeMac:BeOS:*:*) # BeOS running on Mac or Mac clone, PPC only. + echo powerpc-apple-beos + exit ;; + BePC:BeOS:*:*) # BeOS running on Intel PC compatible. 
+ echo i586-pc-beos + exit ;; + SX-4:SUPER-UX:*:*) + echo sx4-nec-superux${UNAME_RELEASE} + exit ;; + SX-5:SUPER-UX:*:*) + echo sx5-nec-superux${UNAME_RELEASE} + exit ;; + SX-6:SUPER-UX:*:*) + echo sx6-nec-superux${UNAME_RELEASE} + exit ;; + SX-7:SUPER-UX:*:*) + echo sx7-nec-superux${UNAME_RELEASE} + exit ;; + SX-8:SUPER-UX:*:*) + echo sx8-nec-superux${UNAME_RELEASE} + exit ;; + SX-8R:SUPER-UX:*:*) + echo sx8r-nec-superux${UNAME_RELEASE} + exit ;; + Power*:Rhapsody:*:*) + echo powerpc-apple-rhapsody${UNAME_RELEASE} + exit ;; + *:Rhapsody:*:*) + echo ${UNAME_MACHINE}-apple-rhapsody${UNAME_RELEASE} + exit ;; + *:Darwin:*:*) + UNAME_PROCESSOR=`uname -p` || UNAME_PROCESSOR=unknown + case $UNAME_PROCESSOR in + unknown) UNAME_PROCESSOR=powerpc ;; + esac + echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE} + exit ;; + *:procnto*:*:* | *:QNX:[0123456789]*:*) + UNAME_PROCESSOR=`uname -p` + if test "$UNAME_PROCESSOR" = "x86"; then + UNAME_PROCESSOR=i386 + UNAME_MACHINE=pc + fi + echo ${UNAME_PROCESSOR}-${UNAME_MACHINE}-nto-qnx${UNAME_RELEASE} + exit ;; + *:QNX:*:4*) + echo i386-pc-qnx + exit ;; + NSE-?:NONSTOP_KERNEL:*:*) + echo nse-tandem-nsk${UNAME_RELEASE} + exit ;; + NSR-?:NONSTOP_KERNEL:*:*) + echo nsr-tandem-nsk${UNAME_RELEASE} + exit ;; + *:NonStop-UX:*:*) + echo mips-compaq-nonstopux + exit ;; + BS2000:POSIX*:*:*) + echo bs2000-siemens-sysv + exit ;; + DS/*:UNIX_System_V:*:*) + echo ${UNAME_MACHINE}-${UNAME_SYSTEM}-${UNAME_RELEASE} + exit ;; + *:Plan9:*:*) + # "uname -m" is not consistent, so use $cputype instead. 386 + # is converted to i386 for consistency with other x86 + # operating systems. + if test "$cputype" = "386"; then + UNAME_MACHINE=i386 + else + UNAME_MACHINE="$cputype" + fi + echo ${UNAME_MACHINE}-unknown-plan9 + exit ;; + *:TOPS-10:*:*) + echo pdp10-unknown-tops10 + exit ;; + *:TENEX:*:*) + echo pdp10-unknown-tenex + exit ;; + KS10:TOPS-20:*:* | KL10:TOPS-20:*:* | TYPE4:TOPS-20:*:*) + echo pdp10-dec-tops20 + exit ;; + XKL-1:TOPS-20:*:* | TYPE5:TOPS-20:*:*) + echo pdp10-xkl-tops20 + exit ;; + *:TOPS-20:*:*) + echo pdp10-unknown-tops20 + exit ;; + *:ITS:*:*) + echo pdp10-unknown-its + exit ;; + SEI:*:*:SEIUX) + echo mips-sei-seiux${UNAME_RELEASE} + exit ;; + *:DragonFly:*:*) + echo ${UNAME_MACHINE}-unknown-dragonfly`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` + exit ;; + *:*VMS:*:*) + UNAME_MACHINE=`(uname -p) 2>/dev/null` + case "${UNAME_MACHINE}" in + A*) echo alpha-dec-vms ; exit ;; + I*) echo ia64-dec-vms ; exit ;; + V*) echo vax-dec-vms ; exit ;; + esac ;; + *:XENIX:*:SysV) + echo i386-pc-xenix + exit ;; + i*86:skyos:*:*) + echo ${UNAME_MACHINE}-pc-skyos`echo ${UNAME_RELEASE}` | sed -e 's/ .*$//' + exit ;; + i*86:rdos:*:*) + echo ${UNAME_MACHINE}-pc-rdos + exit ;; +esac + +#echo '(No uname command or uname output not recognized.)' 1>&2 +#echo "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" 1>&2 + +eval $set_cc_for_build +cat >$dummy.c < +# include +#endif +main () +{ +#if defined (sony) +#if defined (MIPSEB) + /* BFD wants "bsd" instead of "newsos". Perhaps BFD should be changed, + I don't know.... 
*/ + printf ("mips-sony-bsd\n"); exit (0); +#else +#include + printf ("m68k-sony-newsos%s\n", +#ifdef NEWSOS4 + "4" +#else + "" +#endif + ); exit (0); +#endif +#endif + +#if defined (__arm) && defined (__acorn) && defined (__unix) + printf ("arm-acorn-riscix\n"); exit (0); +#endif + +#if defined (hp300) && !defined (hpux) + printf ("m68k-hp-bsd\n"); exit (0); +#endif + +#if defined (NeXT) +#if !defined (__ARCHITECTURE__) +#define __ARCHITECTURE__ "m68k" +#endif + int version; + version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`; + if (version < 4) + printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version); + else + printf ("%s-next-openstep%d\n", __ARCHITECTURE__, version); + exit (0); +#endif + +#if defined (MULTIMAX) || defined (n16) +#if defined (UMAXV) + printf ("ns32k-encore-sysv\n"); exit (0); +#else +#if defined (CMU) + printf ("ns32k-encore-mach\n"); exit (0); +#else + printf ("ns32k-encore-bsd\n"); exit (0); +#endif +#endif +#endif + +#if defined (__386BSD__) + printf ("i386-pc-bsd\n"); exit (0); +#endif + +#if defined (sequent) +#if defined (i386) + printf ("i386-sequent-dynix\n"); exit (0); +#endif +#if defined (ns32000) + printf ("ns32k-sequent-dynix\n"); exit (0); +#endif +#endif + +#if defined (_SEQUENT_) + struct utsname un; + + uname(&un); + + if (strncmp(un.version, "V2", 2) == 0) { + printf ("i386-sequent-ptx2\n"); exit (0); + } + if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? */ + printf ("i386-sequent-ptx1\n"); exit (0); + } + printf ("i386-sequent-ptx\n"); exit (0); + +#endif + +#if defined (vax) +# if !defined (ultrix) +# include +# if defined (BSD) +# if BSD == 43 + printf ("vax-dec-bsd4.3\n"); exit (0); +# else +# if BSD == 199006 + printf ("vax-dec-bsd4.3reno\n"); exit (0); +# else + printf ("vax-dec-bsd\n"); exit (0); +# endif +# endif +# else + printf ("vax-dec-bsd\n"); exit (0); +# endif +# else + printf ("vax-dec-ultrix\n"); exit (0); +# endif +#endif + +#if defined (alliant) && defined (i860) + printf ("i860-alliant-bsd\n"); exit (0); +#endif + + exit (1); +} +EOF + +$CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null && SYSTEM_NAME=`$dummy` && + { echo "$SYSTEM_NAME"; exit; } + +# Apollos put the system type in the environment. + +test -d /usr/apollo && { echo ${ISP}-apollo-${SYSTYPE}; exit; } + +# Convex versions that predate uname can use getsysinfo(1) + +if [ -x /usr/convex/getsysinfo ] +then + case `getsysinfo -f cpu_type` in + c1*) + echo c1-convex-bsd + exit ;; + c2*) + if getsysinfo -f scalar_acc + then echo c32-convex-bsd + else echo c2-convex-bsd + fi + exit ;; + c34*) + echo c34-convex-bsd + exit ;; + c38*) + echo c38-convex-bsd + exit ;; + c4*) + echo c4-convex-bsd + exit ;; + esac +fi + +cat >&2 < in order to provide the needed +information to handle your system. 
+ +config.guess timestamp = $timestamp + +uname -m = `(uname -m) 2>/dev/null || echo unknown` +uname -r = `(uname -r) 2>/dev/null || echo unknown` +uname -s = `(uname -s) 2>/dev/null || echo unknown` +uname -v = `(uname -v) 2>/dev/null || echo unknown` + +/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null` +/bin/uname -X = `(/bin/uname -X) 2>/dev/null` + +hostinfo = `(hostinfo) 2>/dev/null` +/bin/universe = `(/bin/universe) 2>/dev/null` +/usr/bin/arch -k = `(/usr/bin/arch -k) 2>/dev/null` +/bin/arch = `(/bin/arch) 2>/dev/null` +/usr/bin/oslevel = `(/usr/bin/oslevel) 2>/dev/null` +/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null` + +UNAME_MACHINE = ${UNAME_MACHINE} +UNAME_RELEASE = ${UNAME_RELEASE} +UNAME_SYSTEM = ${UNAME_SYSTEM} +UNAME_VERSION = ${UNAME_VERSION} +EOF + +exit 1 + +# Local variables: +# eval: (add-hook 'write-file-hooks 'time-stamp) +# time-stamp-start: "timestamp='" +# time-stamp-format: "%:y-%02m-%02d" +# time-stamp-end: "'" +# End: diff --git a/config.sub b/config.sub new file mode 100644 index 0000000..6759825 --- /dev/null +++ b/config.sub @@ -0,0 +1,1658 @@ +#! /bin/sh +# Configuration validation subroutine script. +# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, +# 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 +# Free Software Foundation, Inc. + +timestamp='2008-01-16' + +# This file is (in principle) common to ALL GNU software. +# The presence of a machine in this file suggests that SOME GNU software +# can handle that machine. It does not imply ALL GNU software can. +# +# This file is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA +# 02110-1301, USA. +# +# As a special exception to the GNU General Public License, if you +# distribute this file as part of a program that contains a +# configuration script generated by Autoconf, you may include it under +# the same distribution terms that you use for the rest of that program. + + +# Please send patches to . Submit a context +# diff and a properly formatted ChangeLog entry. +# +# Configuration subroutine to validate and canonicalize a configuration type. +# Supply the specified configuration type as an argument. +# If it is invalid, we print an error message on stderr and exit with code 1. +# Otherwise, we print the canonical config type on stdout and succeed. + +# This file is supposed to be the same for all GNU packages +# and recognize all the CPU types, system types and aliases +# that are meaningful with *any* GNU software. +# Each package is responsible for reporting which valid configurations +# it does not support. The user should be able to distinguish +# a failure to support a valid configuration from a meaningless +# configuration. 
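# --- Illustrative aside (not part of the generated scripts above or below) ---
# config.guess, which ends above, prints a guessed CPU-VENDOR-OS triplet such as
# x86_64-unknown-linux-gnu; config.sub, which begins here, canonicalizes a
# user-supplied type or alias into that same form. A minimal sketch of how the
# two are typically driven from a package's top-level directory (invocations
# are examples only; the outputs follow from the tables later in this file):
sh ./config.guess              # e.g. prints x86_64-unknown-linux-gnu on a 64-bit Linux host
sh ./config.sub i686-linux     # canonicalized to i686-pc-linux-gnu
sh ./config.sub amd64          # alias expanded to x86_64-pc-none (no OS given, so -none)
sh ./config.sub sun4           # alias expanded to sparc-sun-sunos4.1.1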
+ +# The goal of this file is to map all the various variations of a given +# machine specification into a single specification in the form: +# CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM +# or in some cases, the newer four-part form: +# CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM +# It is wrong to echo any other type of specification. + +me=`echo "$0" | sed -e 's,.*/,,'` + +usage="\ +Usage: $0 [OPTION] CPU-MFR-OPSYS + $0 [OPTION] ALIAS + +Canonicalize a configuration name. + +Operation modes: + -h, --help print this help, then exit + -t, --time-stamp print date of last modification, then exit + -v, --version print version number, then exit + +Report bugs and patches to ." + +version="\ +GNU config.sub ($timestamp) + +Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, +2002, 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc. + +This is free software; see the source for copying conditions. There is NO +warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." + +help=" +Try \`$me --help' for more information." + +# Parse command line +while test $# -gt 0 ; do + case $1 in + --time-stamp | --time* | -t ) + echo "$timestamp" ; exit ;; + --version | -v ) + echo "$version" ; exit ;; + --help | --h* | -h ) + echo "$usage"; exit ;; + -- ) # Stop option processing + shift; break ;; + - ) # Use stdin as input. + break ;; + -* ) + echo "$me: invalid option $1$help" + exit 1 ;; + + *local*) + # First pass through any local machine types. + echo $1 + exit ;; + + * ) + break ;; + esac +done + +case $# in + 0) echo "$me: missing argument$help" >&2 + exit 1;; + 1) ;; + *) echo "$me: too many arguments$help" >&2 + exit 1;; +esac + +# Separate what the user gave into CPU-COMPANY and OS or KERNEL-OS (if any). +# Here we must recognize all the valid KERNEL-OS combinations. +maybe_os=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'` +case $maybe_os in + nto-qnx* | linux-gnu* | linux-dietlibc | linux-newlib* | linux-uclibc* | \ + uclinux-uclibc* | uclinux-gnu* | kfreebsd*-gnu* | knetbsd*-gnu* | netbsd*-gnu* | \ + storm-chaos* | os2-emx* | rtmk-nova*) + os=-$maybe_os + basic_machine=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'` + ;; + *) + basic_machine=`echo $1 | sed 's/-[^-]*$//'` + if [ $basic_machine != $1 ] + then os=`echo $1 | sed 's/.*-/-/'` + else os=; fi + ;; +esac + +### Let's recognize common machines as not being operating systems so +### that things like config.sub decstation-3100 work. We also +### recognize some manufacturers as not being operating systems, so we +### can provide default operating systems below. +case $os in + -sun*os*) + # Prevent following clause from handling this invalid input. 
+ ;; + -dec* | -mips* | -sequent* | -encore* | -pc532* | -sgi* | -sony* | \ + -att* | -7300* | -3300* | -delta* | -motorola* | -sun[234]* | \ + -unicom* | -ibm* | -next | -hp | -isi* | -apollo | -altos* | \ + -convergent* | -ncr* | -news | -32* | -3600* | -3100* | -hitachi* |\ + -c[123]* | -convex* | -sun | -crds | -omron* | -dg | -ultra | -tti* | \ + -harris | -dolphin | -highlevel | -gould | -cbm | -ns | -masscomp | \ + -apple | -axis | -knuth | -cray) + os= + basic_machine=$1 + ;; + -sim | -cisco | -oki | -wec | -winbond) + os= + basic_machine=$1 + ;; + -scout) + ;; + -wrs) + os=-vxworks + basic_machine=$1 + ;; + -chorusos*) + os=-chorusos + basic_machine=$1 + ;; + -chorusrdb) + os=-chorusrdb + basic_machine=$1 + ;; + -hiux*) + os=-hiuxwe2 + ;; + -sco6) + os=-sco5v6 + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -sco5) + os=-sco3.2v5 + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -sco4) + os=-sco3.2v4 + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -sco3.2.[4-9]*) + os=`echo $os | sed -e 's/sco3.2./sco3.2v/'` + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -sco3.2v[4-9]*) + # Don't forget version if it is 3.2v4 or newer. + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -sco5v6*) + # Don't forget version if it is 3.2v4 or newer. + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -sco*) + os=-sco3.2v2 + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -udk*) + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -isc) + os=-isc2.2 + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -clix*) + basic_machine=clipper-intergraph + ;; + -isc*) + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -lynx*) + os=-lynxos + ;; + -ptx*) + basic_machine=`echo $1 | sed -e 's/86-.*/86-sequent/'` + ;; + -windowsnt*) + os=`echo $os | sed -e 's/windowsnt/winnt/'` + ;; + -psos*) + os=-psos + ;; + -mint | -mint[0-9]*) + basic_machine=m68k-atari + os=-mint + ;; +esac + +# Decode aliases for certain CPU-COMPANY combinations. +case $basic_machine in + # Recognize the basic CPU types without company name. + # Some are omitted here because they have special meanings below. 
+ 1750a | 580 \ + | a29k \ + | alpha | alphaev[4-8] | alphaev56 | alphaev6[78] | alphapca5[67] \ + | alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] | alpha64pca5[67] \ + | am33_2.0 \ + | arc | arm | arm[bl]e | arme[lb] | armv[2345] | armv[345][lb] | avr | avr32 \ + | bfin \ + | c4x | clipper \ + | d10v | d30v | dlx | dsp16xx \ + | fido | fr30 | frv \ + | h8300 | h8500 | hppa | hppa1.[01] | hppa2.0 | hppa2.0[nw] | hppa64 \ + | i370 | i860 | i960 | ia64 \ + | ip2k | iq2000 \ + | m32c | m32r | m32rle | m68000 | m68k | m88k \ + | maxq | mb | microblaze | mcore | mep \ + | mips | mipsbe | mipseb | mipsel | mipsle \ + | mips16 \ + | mips64 | mips64el \ + | mips64vr | mips64vrel \ + | mips64orion | mips64orionel \ + | mips64vr4100 | mips64vr4100el \ + | mips64vr4300 | mips64vr4300el \ + | mips64vr5000 | mips64vr5000el \ + | mips64vr5900 | mips64vr5900el \ + | mipsisa32 | mipsisa32el \ + | mipsisa32r2 | mipsisa32r2el \ + | mipsisa64 | mipsisa64el \ + | mipsisa64r2 | mipsisa64r2el \ + | mipsisa64sb1 | mipsisa64sb1el \ + | mipsisa64sr71k | mipsisa64sr71kel \ + | mipstx39 | mipstx39el \ + | mn10200 | mn10300 \ + | mt \ + | msp430 \ + | nios | nios2 \ + | ns16k | ns32k \ + | or32 \ + | pdp10 | pdp11 | pj | pjl \ + | powerpc | powerpc64 | powerpc64le | powerpcle | ppcbe \ + | pyramid \ + | score \ + | sh | sh[1234] | sh[24]a | sh[23]e | sh[34]eb | sheb | shbe | shle | sh[1234]le | sh3ele \ + | sh64 | sh64le \ + | sparc | sparc64 | sparc64b | sparc64v | sparc86x | sparclet | sparclite \ + | sparcv8 | sparcv9 | sparcv9b | sparcv9v \ + | spu | strongarm \ + | tahoe | thumb | tic4x | tic80 | tron \ + | v850 | v850e \ + | we32k \ + | x86 | xc16x | xscale | xscalee[bl] | xstormy16 | xtensa \ + | z8k) + basic_machine=$basic_machine-unknown + ;; + m6811 | m68hc11 | m6812 | m68hc12) + # Motorola 68HC11/12. + basic_machine=$basic_machine-unknown + os=-none + ;; + m88110 | m680[12346]0 | m683?2 | m68360 | m5200 | v70 | w65 | z8k) + ;; + ms1) + basic_machine=mt-unknown + ;; + + # We use `pc' rather than `unknown' + # because (1) that's what they normally are, and + # (2) the word "unknown" tends to confuse beginning users. + i*86 | x86_64) + basic_machine=$basic_machine-pc + ;; + # Object if more than one company name word. + *-*-*) + echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2 + exit 1 + ;; + # Recognize the basic CPU types with company name. 
+ 580-* \ + | a29k-* \ + | alpha-* | alphaev[4-8]-* | alphaev56-* | alphaev6[78]-* \ + | alpha64-* | alpha64ev[4-8]-* | alpha64ev56-* | alpha64ev6[78]-* \ + | alphapca5[67]-* | alpha64pca5[67]-* | arc-* \ + | arm-* | armbe-* | armle-* | armeb-* | armv*-* \ + | avr-* | avr32-* \ + | bfin-* | bs2000-* \ + | c[123]* | c30-* | [cjt]90-* | c4x-* | c54x-* | c55x-* | c6x-* \ + | clipper-* | craynv-* | cydra-* \ + | d10v-* | d30v-* | dlx-* \ + | elxsi-* \ + | f30[01]-* | f700-* | fido-* | fr30-* | frv-* | fx80-* \ + | h8300-* | h8500-* \ + | hppa-* | hppa1.[01]-* | hppa2.0-* | hppa2.0[nw]-* | hppa64-* \ + | i*86-* | i860-* | i960-* | ia64-* \ + | ip2k-* | iq2000-* \ + | m32c-* | m32r-* | m32rle-* \ + | m68000-* | m680[012346]0-* | m68360-* | m683?2-* | m68k-* \ + | m88110-* | m88k-* | maxq-* | mcore-* \ + | mips-* | mipsbe-* | mipseb-* | mipsel-* | mipsle-* \ + | mips16-* \ + | mips64-* | mips64el-* \ + | mips64vr-* | mips64vrel-* \ + | mips64orion-* | mips64orionel-* \ + | mips64vr4100-* | mips64vr4100el-* \ + | mips64vr4300-* | mips64vr4300el-* \ + | mips64vr5000-* | mips64vr5000el-* \ + | mips64vr5900-* | mips64vr5900el-* \ + | mipsisa32-* | mipsisa32el-* \ + | mipsisa32r2-* | mipsisa32r2el-* \ + | mipsisa64-* | mipsisa64el-* \ + | mipsisa64r2-* | mipsisa64r2el-* \ + | mipsisa64sb1-* | mipsisa64sb1el-* \ + | mipsisa64sr71k-* | mipsisa64sr71kel-* \ + | mipstx39-* | mipstx39el-* \ + | mmix-* \ + | mt-* \ + | msp430-* \ + | nios-* | nios2-* \ + | none-* | np1-* | ns16k-* | ns32k-* \ + | orion-* \ + | pdp10-* | pdp11-* | pj-* | pjl-* | pn-* | power-* \ + | powerpc-* | powerpc64-* | powerpc64le-* | powerpcle-* | ppcbe-* \ + | pyramid-* \ + | romp-* | rs6000-* \ + | sh-* | sh[1234]-* | sh[24]a-* | sh[23]e-* | sh[34]eb-* | sheb-* | shbe-* \ + | shle-* | sh[1234]le-* | sh3ele-* | sh64-* | sh64le-* \ + | sparc-* | sparc64-* | sparc64b-* | sparc64v-* | sparc86x-* | sparclet-* \ + | sparclite-* \ + | sparcv8-* | sparcv9-* | sparcv9b-* | sparcv9v-* | strongarm-* | sv1-* | sx?-* \ + | tahoe-* | thumb-* \ + | tic30-* | tic4x-* | tic54x-* | tic55x-* | tic6x-* | tic80-* \ + | tron-* \ + | v850-* | v850e-* | vax-* \ + | we32k-* \ + | x86-* | x86_64-* | xc16x-* | xps100-* | xscale-* | xscalee[bl]-* \ + | xstormy16-* | xtensa*-* \ + | ymp-* \ + | z8k-*) + ;; + # Recognize the basic CPU types without company name, with glob match. + xtensa*) + basic_machine=$basic_machine-unknown + ;; + # Recognize the various machine names and aliases which stand + # for a CPU type and a company and sometimes even an OS. 
+ 386bsd) + basic_machine=i386-unknown + os=-bsd + ;; + 3b1 | 7300 | 7300-att | att-7300 | pc7300 | safari | unixpc) + basic_machine=m68000-att + ;; + 3b*) + basic_machine=we32k-att + ;; + a29khif) + basic_machine=a29k-amd + os=-udi + ;; + abacus) + basic_machine=abacus-unknown + ;; + adobe68k) + basic_machine=m68010-adobe + os=-scout + ;; + alliant | fx80) + basic_machine=fx80-alliant + ;; + altos | altos3068) + basic_machine=m68k-altos + ;; + am29k) + basic_machine=a29k-none + os=-bsd + ;; + amd64) + basic_machine=x86_64-pc + ;; + amd64-*) + basic_machine=x86_64-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + amdahl) + basic_machine=580-amdahl + os=-sysv + ;; + amiga | amiga-*) + basic_machine=m68k-unknown + ;; + amigaos | amigados) + basic_machine=m68k-unknown + os=-amigaos + ;; + amigaunix | amix) + basic_machine=m68k-unknown + os=-sysv4 + ;; + apollo68) + basic_machine=m68k-apollo + os=-sysv + ;; + apollo68bsd) + basic_machine=m68k-apollo + os=-bsd + ;; + aux) + basic_machine=m68k-apple + os=-aux + ;; + balance) + basic_machine=ns32k-sequent + os=-dynix + ;; + blackfin) + basic_machine=bfin-unknown + os=-linux + ;; + blackfin-*) + basic_machine=bfin-`echo $basic_machine | sed 's/^[^-]*-//'` + os=-linux + ;; + c90) + basic_machine=c90-cray + os=-unicos + ;; + convex-c1) + basic_machine=c1-convex + os=-bsd + ;; + convex-c2) + basic_machine=c2-convex + os=-bsd + ;; + convex-c32) + basic_machine=c32-convex + os=-bsd + ;; + convex-c34) + basic_machine=c34-convex + os=-bsd + ;; + convex-c38) + basic_machine=c38-convex + os=-bsd + ;; + cray | j90) + basic_machine=j90-cray + os=-unicos + ;; + craynv) + basic_machine=craynv-cray + os=-unicosmp + ;; + cr16) + basic_machine=cr16-unknown + os=-elf + ;; + crds | unos) + basic_machine=m68k-crds + ;; + crisv32 | crisv32-* | etraxfs*) + basic_machine=crisv32-axis + ;; + cris | cris-* | etrax*) + basic_machine=cris-axis + ;; + crx) + basic_machine=crx-unknown + os=-elf + ;; + da30 | da30-*) + basic_machine=m68k-da30 + ;; + decstation | decstation-3100 | pmax | pmax-* | pmin | dec3100 | decstatn) + basic_machine=mips-dec + ;; + decsystem10* | dec10*) + basic_machine=pdp10-dec + os=-tops10 + ;; + decsystem20* | dec20*) + basic_machine=pdp10-dec + os=-tops20 + ;; + delta | 3300 | motorola-3300 | motorola-delta \ + | 3300-motorola | delta-motorola) + basic_machine=m68k-motorola + ;; + delta88) + basic_machine=m88k-motorola + os=-sysv3 + ;; + djgpp) + basic_machine=i586-pc + os=-msdosdjgpp + ;; + dpx20 | dpx20-*) + basic_machine=rs6000-bull + os=-bosx + ;; + dpx2* | dpx2*-bull) + basic_machine=m68k-bull + os=-sysv3 + ;; + ebmon29k) + basic_machine=a29k-amd + os=-ebmon + ;; + elxsi) + basic_machine=elxsi-elxsi + os=-bsd + ;; + encore | umax | mmax) + basic_machine=ns32k-encore + ;; + es1800 | OSE68k | ose68k | ose | OSE) + basic_machine=m68k-ericsson + os=-ose + ;; + fx2800) + basic_machine=i860-alliant + ;; + genix) + basic_machine=ns32k-ns + ;; + gmicro) + basic_machine=tron-gmicro + os=-sysv + ;; + go32) + basic_machine=i386-pc + os=-go32 + ;; + h3050r* | hiux*) + basic_machine=hppa1.1-hitachi + os=-hiuxwe2 + ;; + h8300hms) + basic_machine=h8300-hitachi + os=-hms + ;; + h8300xray) + basic_machine=h8300-hitachi + os=-xray + ;; + h8500hms) + basic_machine=h8500-hitachi + os=-hms + ;; + harris) + basic_machine=m88k-harris + os=-sysv3 + ;; + hp300-*) + basic_machine=m68k-hp + ;; + hp300bsd) + basic_machine=m68k-hp + os=-bsd + ;; + hp300hpux) + basic_machine=m68k-hp + os=-hpux + ;; + hp3k9[0-9][0-9] | hp9[0-9][0-9]) + basic_machine=hppa1.0-hp + ;; + 
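# --- Illustrative aside ---
# Many of the alias arms above rewrite only the CPU field and keep whatever
# vendor the caller supplied, via the recurring idiom
# `echo $basic_machine | sed 's/^[^-]*-//'` which strips the first dash-separated
# field. A small sketch of that idiom in isolation (the variable value here is
# just an example):
basic_machine=amd64-hp
vendor_part=`echo $basic_machine | sed 's/^[^-]*-//'`
echo "x86_64-$vendor_part"     # prints x86_64-hp, just as the amd64-* arm above does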
hp9k2[0-9][0-9] | hp9k31[0-9]) + basic_machine=m68000-hp + ;; + hp9k3[2-9][0-9]) + basic_machine=m68k-hp + ;; + hp9k6[0-9][0-9] | hp6[0-9][0-9]) + basic_machine=hppa1.0-hp + ;; + hp9k7[0-79][0-9] | hp7[0-79][0-9]) + basic_machine=hppa1.1-hp + ;; + hp9k78[0-9] | hp78[0-9]) + # FIXME: really hppa2.0-hp + basic_machine=hppa1.1-hp + ;; + hp9k8[67]1 | hp8[67]1 | hp9k80[24] | hp80[24] | hp9k8[78]9 | hp8[78]9 | hp9k893 | hp893) + # FIXME: really hppa2.0-hp + basic_machine=hppa1.1-hp + ;; + hp9k8[0-9][13679] | hp8[0-9][13679]) + basic_machine=hppa1.1-hp + ;; + hp9k8[0-9][0-9] | hp8[0-9][0-9]) + basic_machine=hppa1.0-hp + ;; + hppa-next) + os=-nextstep3 + ;; + hppaosf) + basic_machine=hppa1.1-hp + os=-osf + ;; + hppro) + basic_machine=hppa1.1-hp + os=-proelf + ;; + i370-ibm* | ibm*) + basic_machine=i370-ibm + ;; +# I'm not sure what "Sysv32" means. Should this be sysv3.2? + i*86v32) + basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` + os=-sysv32 + ;; + i*86v4*) + basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` + os=-sysv4 + ;; + i*86v) + basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` + os=-sysv + ;; + i*86sol2) + basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` + os=-solaris2 + ;; + i386mach) + basic_machine=i386-mach + os=-mach + ;; + i386-vsta | vsta) + basic_machine=i386-unknown + os=-vsta + ;; + iris | iris4d) + basic_machine=mips-sgi + case $os in + -irix*) + ;; + *) + os=-irix4 + ;; + esac + ;; + isi68 | isi) + basic_machine=m68k-isi + os=-sysv + ;; + m68knommu) + basic_machine=m68k-unknown + os=-linux + ;; + m68knommu-*) + basic_machine=m68k-`echo $basic_machine | sed 's/^[^-]*-//'` + os=-linux + ;; + m88k-omron*) + basic_machine=m88k-omron + ;; + magnum | m3230) + basic_machine=mips-mips + os=-sysv + ;; + merlin) + basic_machine=ns32k-utek + os=-sysv + ;; + mingw32) + basic_machine=i386-pc + os=-mingw32 + ;; + mingw32ce) + basic_machine=arm-unknown + os=-mingw32ce + ;; + miniframe) + basic_machine=m68000-convergent + ;; + *mint | -mint[0-9]* | *MiNT | *MiNT[0-9]*) + basic_machine=m68k-atari + os=-mint + ;; + mips3*-*) + basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'` + ;; + mips3*) + basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`-unknown + ;; + monitor) + basic_machine=m68k-rom68k + os=-coff + ;; + morphos) + basic_machine=powerpc-unknown + os=-morphos + ;; + msdos) + basic_machine=i386-pc + os=-msdos + ;; + ms1-*) + basic_machine=`echo $basic_machine | sed -e 's/ms1-/mt-/'` + ;; + mvs) + basic_machine=i370-ibm + os=-mvs + ;; + ncr3000) + basic_machine=i486-ncr + os=-sysv4 + ;; + netbsd386) + basic_machine=i386-unknown + os=-netbsd + ;; + netwinder) + basic_machine=armv4l-rebel + os=-linux + ;; + news | news700 | news800 | news900) + basic_machine=m68k-sony + os=-newsos + ;; + news1000) + basic_machine=m68030-sony + os=-newsos + ;; + news-3600 | risc-news) + basic_machine=mips-sony + os=-newsos + ;; + necv70) + basic_machine=v70-nec + os=-sysv + ;; + next | m*-next ) + basic_machine=m68k-next + case $os in + -nextstep* ) + ;; + -ns2*) + os=-nextstep2 + ;; + *) + os=-nextstep3 + ;; + esac + ;; + nh3000) + basic_machine=m68k-harris + os=-cxux + ;; + nh[45]000) + basic_machine=m88k-harris + os=-cxux + ;; + nindy960) + basic_machine=i960-intel + os=-nindy + ;; + mon960) + basic_machine=i960-intel + os=-mon960 + ;; + nonstopux) + basic_machine=mips-compaq + os=-nonstopux + ;; + np1) + basic_machine=np1-gould + ;; + nsr-tandem) + basic_machine=nsr-tandem + ;; + op50n-* | op60c-*) + basic_machine=hppa1.1-oki + os=-proelf + ;; + openrisc | openrisc-*) + 
basic_machine=or32-unknown + ;; + os400) + basic_machine=powerpc-ibm + os=-os400 + ;; + OSE68000 | ose68000) + basic_machine=m68000-ericsson + os=-ose + ;; + os68k) + basic_machine=m68k-none + os=-os68k + ;; + pa-hitachi) + basic_machine=hppa1.1-hitachi + os=-hiuxwe2 + ;; + paragon) + basic_machine=i860-intel + os=-osf + ;; + parisc) + basic_machine=hppa-unknown + os=-linux + ;; + parisc-*) + basic_machine=hppa-`echo $basic_machine | sed 's/^[^-]*-//'` + os=-linux + ;; + pbd) + basic_machine=sparc-tti + ;; + pbb) + basic_machine=m68k-tti + ;; + pc532 | pc532-*) + basic_machine=ns32k-pc532 + ;; + pc98) + basic_machine=i386-pc + ;; + pc98-*) + basic_machine=i386-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + pentium | p5 | k5 | k6 | nexgen | viac3) + basic_machine=i586-pc + ;; + pentiumpro | p6 | 6x86 | athlon | athlon_*) + basic_machine=i686-pc + ;; + pentiumii | pentium2 | pentiumiii | pentium3) + basic_machine=i686-pc + ;; + pentium4) + basic_machine=i786-pc + ;; + pentium-* | p5-* | k5-* | k6-* | nexgen-* | viac3-*) + basic_machine=i586-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + pentiumpro-* | p6-* | 6x86-* | athlon-*) + basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + pentiumii-* | pentium2-* | pentiumiii-* | pentium3-*) + basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + pentium4-*) + basic_machine=i786-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + pn) + basic_machine=pn-gould + ;; + power) basic_machine=power-ibm + ;; + ppc) basic_machine=powerpc-unknown + ;; + ppc-*) basic_machine=powerpc-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + ppcle | powerpclittle | ppc-le | powerpc-little) + basic_machine=powerpcle-unknown + ;; + ppcle-* | powerpclittle-*) + basic_machine=powerpcle-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + ppc64) basic_machine=powerpc64-unknown + ;; + ppc64-*) basic_machine=powerpc64-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + ppc64le | powerpc64little | ppc64-le | powerpc64-little) + basic_machine=powerpc64le-unknown + ;; + ppc64le-* | powerpc64little-*) + basic_machine=powerpc64le-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + ps2) + basic_machine=i386-ibm + ;; + pw32) + basic_machine=i586-unknown + os=-pw32 + ;; + rdos) + basic_machine=i386-pc + os=-rdos + ;; + rom68k) + basic_machine=m68k-rom68k + os=-coff + ;; + rm[46]00) + basic_machine=mips-siemens + ;; + rtpc | rtpc-*) + basic_machine=romp-ibm + ;; + s390 | s390-*) + basic_machine=s390-ibm + ;; + s390x | s390x-*) + basic_machine=s390x-ibm + ;; + sa29200) + basic_machine=a29k-amd + os=-udi + ;; + sb1) + basic_machine=mipsisa64sb1-unknown + ;; + sb1el) + basic_machine=mipsisa64sb1el-unknown + ;; + sde) + basic_machine=mipsisa32-sde + os=-elf + ;; + sei) + basic_machine=mips-sei + os=-seiux + ;; + sequent) + basic_machine=i386-sequent + ;; + sh) + basic_machine=sh-hitachi + os=-hms + ;; + sh5el) + basic_machine=sh5le-unknown + ;; + sh64) + basic_machine=sh64-unknown + ;; + sparclite-wrs | simso-wrs) + basic_machine=sparclite-wrs + os=-vxworks + ;; + sps7) + basic_machine=m68k-bull + os=-sysv2 + ;; + spur) + basic_machine=spur-unknown + ;; + st2000) + basic_machine=m68k-tandem + ;; + stratus) + basic_machine=i860-stratus + os=-sysv4 + ;; + sun2) + basic_machine=m68000-sun + ;; + sun2os3) + basic_machine=m68000-sun + os=-sunos3 + ;; + sun2os4) + basic_machine=m68000-sun + os=-sunos4 + ;; + sun3os3) + basic_machine=m68k-sun + os=-sunos3 + ;; + sun3os4) + basic_machine=m68k-sun + os=-sunos4 + ;; + sun4os3) + basic_machine=sparc-sun + os=-sunos3 + ;; + sun4os4) 
+ basic_machine=sparc-sun + os=-sunos4 + ;; + sun4sol2) + basic_machine=sparc-sun + os=-solaris2 + ;; + sun3 | sun3-*) + basic_machine=m68k-sun + ;; + sun4) + basic_machine=sparc-sun + ;; + sun386 | sun386i | roadrunner) + basic_machine=i386-sun + ;; + sv1) + basic_machine=sv1-cray + os=-unicos + ;; + symmetry) + basic_machine=i386-sequent + os=-dynix + ;; + t3e) + basic_machine=alphaev5-cray + os=-unicos + ;; + t90) + basic_machine=t90-cray + os=-unicos + ;; + tic54x | c54x*) + basic_machine=tic54x-unknown + os=-coff + ;; + tic55x | c55x*) + basic_machine=tic55x-unknown + os=-coff + ;; + tic6x | c6x*) + basic_machine=tic6x-unknown + os=-coff + ;; + tile*) + basic_machine=tile-unknown + os=-linux-gnu + ;; + tx39) + basic_machine=mipstx39-unknown + ;; + tx39el) + basic_machine=mipstx39el-unknown + ;; + toad1) + basic_machine=pdp10-xkl + os=-tops20 + ;; + tower | tower-32) + basic_machine=m68k-ncr + ;; + tpf) + basic_machine=s390x-ibm + os=-tpf + ;; + udi29k) + basic_machine=a29k-amd + os=-udi + ;; + ultra3) + basic_machine=a29k-nyu + os=-sym1 + ;; + v810 | necv810) + basic_machine=v810-nec + os=-none + ;; + vaxv) + basic_machine=vax-dec + os=-sysv + ;; + vms) + basic_machine=vax-dec + os=-vms + ;; + vpp*|vx|vx-*) + basic_machine=f301-fujitsu + ;; + vxworks960) + basic_machine=i960-wrs + os=-vxworks + ;; + vxworks68) + basic_machine=m68k-wrs + os=-vxworks + ;; + vxworks29k) + basic_machine=a29k-wrs + os=-vxworks + ;; + w65*) + basic_machine=w65-wdc + os=-none + ;; + w89k-*) + basic_machine=hppa1.1-winbond + os=-proelf + ;; + xbox) + basic_machine=i686-pc + os=-mingw32 + ;; + xps | xps100) + basic_machine=xps100-honeywell + ;; + ymp) + basic_machine=ymp-cray + os=-unicos + ;; + z8k-*-coff) + basic_machine=z8k-unknown + os=-sim + ;; + none) + basic_machine=none-none + os=-none + ;; + +# Here we handle the default manufacturer of certain CPU types. It is in +# some cases the only manufacturer, in others, it is the most popular. + w89k) + basic_machine=hppa1.1-winbond + ;; + op50n) + basic_machine=hppa1.1-oki + ;; + op60c) + basic_machine=hppa1.1-oki + ;; + romp) + basic_machine=romp-ibm + ;; + mmix) + basic_machine=mmix-knuth + ;; + rs6000) + basic_machine=rs6000-ibm + ;; + vax) + basic_machine=vax-dec + ;; + pdp10) + # there are many clones, so DEC is not a safe bet + basic_machine=pdp10-unknown + ;; + pdp11) + basic_machine=pdp11-dec + ;; + we32k) + basic_machine=we32k-att + ;; + sh[1234] | sh[24]a | sh[34]eb | sh[1234]le | sh[23]ele) + basic_machine=sh-unknown + ;; + sparc | sparcv8 | sparcv9 | sparcv9b | sparcv9v) + basic_machine=sparc-sun + ;; + cydra) + basic_machine=cydra-cydrome + ;; + orion) + basic_machine=orion-highlevel + ;; + orion105) + basic_machine=clipper-highlevel + ;; + mac | mpw | mac-mpw) + basic_machine=m68k-apple + ;; + pmac | pmac-mpw) + basic_machine=powerpc-apple + ;; + *-unknown) + # Make sure to match an already-canonicalized machine name. + ;; + *) + echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2 + exit 1 + ;; +esac + +# Here we canonicalize certain aliases for manufacturers. +case $basic_machine in + *-digital*) + basic_machine=`echo $basic_machine | sed 's/digital.*/dec/'` + ;; + *-commodore*) + basic_machine=`echo $basic_machine | sed 's/commodore.*/cbm/'` + ;; + *) + ;; +esac + +# Decode manufacturer-specific aliases for certain operating systems. + +if [ x"$os" != x"" ] +then +case $os in + # First match some system type aliases + # that might get confused with valid system types. 
+ # -solaris* is a basic system type, with this one exception. + -solaris1 | -solaris1.*) + os=`echo $os | sed -e 's|solaris1|sunos4|'` + ;; + -solaris) + os=-solaris2 + ;; + -svr4*) + os=-sysv4 + ;; + -unixware*) + os=-sysv4.2uw + ;; + -gnu/linux*) + os=`echo $os | sed -e 's|gnu/linux|linux-gnu|'` + ;; + # First accept the basic system types. + # The portable systems comes first. + # Each alternative MUST END IN A *, to match a version number. + # -sysv* is not here because it comes later, after sysvr4. + -gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \ + | -*vms* | -sco* | -esix* | -isc* | -aix* | -sunos | -sunos[34]*\ + | -hpux* | -unos* | -osf* | -luna* | -dgux* | -solaris* | -sym* \ + | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \ + | -aos* \ + | -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \ + | -clix* | -riscos* | -uniplus* | -iris* | -rtu* | -xenix* \ + | -hiux* | -386bsd* | -knetbsd* | -mirbsd* | -netbsd* \ + | -openbsd* | -solidbsd* \ + | -ekkobsd* | -kfreebsd* | -freebsd* | -riscix* | -lynxos* \ + | -bosx* | -nextstep* | -cxux* | -aout* | -elf* | -oabi* \ + | -ptx* | -coff* | -ecoff* | -winnt* | -domain* | -vsta* \ + | -udi* | -eabi* | -lites* | -ieee* | -go32* | -aux* \ + | -chorusos* | -chorusrdb* \ + | -cygwin* | -pe* | -psos* | -moss* | -proelf* | -rtems* \ + | -mingw32* | -linux-gnu* | -linux-newlib* | -linux-uclibc* \ + | -uxpv* | -beos* | -mpeix* | -udk* \ + | -interix* | -uwin* | -mks* | -rhapsody* | -darwin* | -opened* \ + | -openstep* | -oskit* | -conix* | -pw32* | -nonstopux* \ + | -storm-chaos* | -tops10* | -tenex* | -tops20* | -its* \ + | -os2* | -vos* | -palmos* | -uclinux* | -nucleus* \ + | -morphos* | -superux* | -rtmk* | -rtmk-nova* | -windiss* \ + | -powermax* | -dnix* | -nx6 | -nx7 | -sei* | -dragonfly* \ + | -skyos* | -haiku* | -rdos* | -toppers* | -drops*) + # Remember, each alternative MUST END IN *, to match a version number. + ;; + -qnx*) + case $basic_machine in + x86-* | i*86-*) + ;; + *) + os=-nto$os + ;; + esac + ;; + -nto-qnx*) + ;; + -nto*) + os=`echo $os | sed -e 's|nto|nto-qnx|'` + ;; + -sim | -es1800* | -hms* | -xray | -os68k* | -none* | -v88r* \ + | -windows* | -osx | -abug | -netware* | -os9* | -beos* | -haiku* \ + | -macos* | -mpw* | -magic* | -mmixware* | -mon960* | -lnews*) + ;; + -mac*) + os=`echo $os | sed -e 's|mac|macos|'` + ;; + -linux-dietlibc) + os=-linux-dietlibc + ;; + -linux*) + os=`echo $os | sed -e 's|linux|linux-gnu|'` + ;; + -sunos5*) + os=`echo $os | sed -e 's|sunos5|solaris2|'` + ;; + -sunos6*) + os=`echo $os | sed -e 's|sunos6|solaris3|'` + ;; + -opened*) + os=-openedition + ;; + -os400*) + os=-os400 + ;; + -wince*) + os=-wince + ;; + -osfrose*) + os=-osfrose + ;; + -osf*) + os=-osf + ;; + -utek*) + os=-bsd + ;; + -dynix*) + os=-bsd + ;; + -acis*) + os=-aos + ;; + -atheos*) + os=-atheos + ;; + -syllable*) + os=-syllable + ;; + -386bsd) + os=-bsd + ;; + -ctix* | -uts*) + os=-sysv + ;; + -nova*) + os=-rtmk-nova + ;; + -ns2 ) + os=-nextstep2 + ;; + -nsk*) + os=-nsk + ;; + # Preserve the version number of sinix5. + -sinix5.*) + os=`echo $os | sed -e 's|sinix|sysv|'` + ;; + -sinix*) + os=-sysv4 + ;; + -tpf*) + os=-tpf + ;; + -triton*) + os=-sysv3 + ;; + -oss*) + os=-sysv3 + ;; + -svr4) + os=-sysv4 + ;; + -svr3) + os=-sysv3 + ;; + -sysvr4) + os=-sysv4 + ;; + # This must come after -sysvr4. 
+ -sysv*) + ;; + -ose*) + os=-ose + ;; + -es1800*) + os=-ose + ;; + -xenix) + os=-xenix + ;; + -*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*) + os=-mint + ;; + -aros*) + os=-aros + ;; + -kaos*) + os=-kaos + ;; + -zvmoe) + os=-zvmoe + ;; + -none) + ;; + *) + # Get rid of the `-' at the beginning of $os. + os=`echo $os | sed 's/[^-]*-//'` + echo Invalid configuration \`$1\': system \`$os\' not recognized 1>&2 + exit 1 + ;; +esac +else + +# Here we handle the default operating systems that come with various machines. +# The value should be what the vendor currently ships out the door with their +# machine or put another way, the most popular os provided with the machine. + +# Note that if you're going to try to match "-MANUFACTURER" here (say, +# "-sun"), then you have to tell the case statement up towards the top +# that MANUFACTURER isn't an operating system. Otherwise, code above +# will signal an error saying that MANUFACTURER isn't an operating +# system, and we'll never get to this point. + +case $basic_machine in + score-*) + os=-elf + ;; + spu-*) + os=-elf + ;; + *-acorn) + os=-riscix1.2 + ;; + arm*-rebel) + os=-linux + ;; + arm*-semi) + os=-aout + ;; + c4x-* | tic4x-*) + os=-coff + ;; + # This must come before the *-dec entry. + pdp10-*) + os=-tops20 + ;; + pdp11-*) + os=-none + ;; + *-dec | vax-*) + os=-ultrix4.2 + ;; + m68*-apollo) + os=-domain + ;; + i386-sun) + os=-sunos4.0.2 + ;; + m68000-sun) + os=-sunos3 + # This also exists in the configure program, but was not the + # default. + # os=-sunos4 + ;; + m68*-cisco) + os=-aout + ;; + mep-*) + os=-elf + ;; + mips*-cisco) + os=-elf + ;; + mips*-*) + os=-elf + ;; + or32-*) + os=-coff + ;; + *-tti) # must be before sparc entry or we get the wrong os. + os=-sysv3 + ;; + sparc-* | *-sun) + os=-sunos4.1.1 + ;; + *-be) + os=-beos + ;; + *-haiku) + os=-haiku + ;; + *-ibm) + os=-aix + ;; + *-knuth) + os=-mmixware + ;; + *-wec) + os=-proelf + ;; + *-winbond) + os=-proelf + ;; + *-oki) + os=-proelf + ;; + *-hp) + os=-hpux + ;; + *-hitachi) + os=-hiux + ;; + i860-* | *-att | *-ncr | *-altos | *-motorola | *-convergent) + os=-sysv + ;; + *-cbm) + os=-amigaos + ;; + *-dg) + os=-dgux + ;; + *-dolphin) + os=-sysv3 + ;; + m68k-ccur) + os=-rtu + ;; + m88k-omron*) + os=-luna + ;; + *-next ) + os=-nextstep + ;; + *-sequent) + os=-ptx + ;; + *-crds) + os=-unos + ;; + *-ns) + os=-genix + ;; + i370-*) + os=-mvs + ;; + *-next) + os=-nextstep3 + ;; + *-gould) + os=-sysv + ;; + *-highlevel) + os=-bsd + ;; + *-encore) + os=-bsd + ;; + *-sgi) + os=-irix + ;; + *-siemens) + os=-sysv4 + ;; + *-masscomp) + os=-rtu + ;; + f30[01]-fujitsu | f700-fujitsu) + os=-uxpv + ;; + *-rom68k) + os=-coff + ;; + *-*bug) + os=-coff + ;; + *-apple) + os=-macos + ;; + *-atari*) + os=-mint + ;; + *) + os=-none + ;; +esac +fi + +# Here we handle the case where we know the os, and the CPU type, but not the +# manufacturer. We pick the logical manufacturer. 
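# --- Illustrative aside ---
# Putting the two OS stages above together: an explicit OS field is first
# normalized (e.g. sunos5 -> solaris2, svr4 -> sysv4, linux -> linux-gnu), and
# when no OS was given at all, the per-machine default table above supplies one.
# Example invocations (a sketch; the results follow from the tables above):
sh ./config.sub sparc-sun-sunos5.8    # normalized to sparc-sun-solaris2.8
sh ./config.sub i586-svr4             # normalized to i586-pc-sysv4
sh ./config.sub vax                   # no OS given: defaults to vax-dec-ultrix4.2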
+vendor=unknown +case $basic_machine in + *-unknown) + case $os in + -riscix*) + vendor=acorn + ;; + -sunos*) + vendor=sun + ;; + -aix*) + vendor=ibm + ;; + -beos*) + vendor=be + ;; + -hpux*) + vendor=hp + ;; + -mpeix*) + vendor=hp + ;; + -hiux*) + vendor=hitachi + ;; + -unos*) + vendor=crds + ;; + -dgux*) + vendor=dg + ;; + -luna*) + vendor=omron + ;; + -genix*) + vendor=ns + ;; + -mvs* | -opened*) + vendor=ibm + ;; + -os400*) + vendor=ibm + ;; + -ptx*) + vendor=sequent + ;; + -tpf*) + vendor=ibm + ;; + -vxsim* | -vxworks* | -windiss*) + vendor=wrs + ;; + -aux*) + vendor=apple + ;; + -hms*) + vendor=hitachi + ;; + -mpw* | -macos*) + vendor=apple + ;; + -*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*) + vendor=atari + ;; + -vos*) + vendor=stratus + ;; + esac + basic_machine=`echo $basic_machine | sed "s/unknown/$vendor/"` + ;; +esac + +echo $basic_machine$os +exit + +# Local variables: +# eval: (add-hook 'write-file-hooks 'time-stamp) +# time-stamp-start: "timestamp='" +# time-stamp-format: "%:y-%02m-%02d" +# time-stamp-end: "'" +# End: diff --git a/configure b/configure new file mode 100755 index 0000000..30ecab4 --- /dev/null +++ b/configure @@ -0,0 +1,8127 @@ +#! /bin/sh +# Guess values for system-dependent variables and create Makefiles. +# Generated by GNU Autoconf 2.69 for ccminer 2014.03.18. +# +# +# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. +# +# +# This configure script is free software; the Free Software Foundation +# gives unlimited permission to copy, distribute and modify it. +## -------------------- ## +## M4sh Initialization. ## +## -------------------- ## + +# Be more Bourne compatible +DUALCASE=1; export DUALCASE # for MKS sh +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then : + emulate sh + NULLCMD=: + # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which + # is contrary to our usage. Disable this feature. + alias -g '${1+"$@"}'='"$@"' + setopt NO_GLOB_SUBST +else + case `(set -o) 2>/dev/null` in #( + *posix*) : + set -o posix ;; #( + *) : + ;; +esac +fi + + +as_nl=' +' +export as_nl +# Printing a long string crashes Solaris 7 /usr/bin/printf. +as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\' +as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo +as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo +# Prefer a ksh shell builtin over an external printf program on Solaris, +# but without wasting forks for bash or zsh. +if test -z "$BASH_VERSION$ZSH_VERSION" \ + && (test "X`print -r -- $as_echo`" = "X$as_echo") 2>/dev/null; then + as_echo='print -r --' + as_echo_n='print -rn --' +elif (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then + as_echo='printf %s\n' + as_echo_n='printf %s' +else + if test "X`(/usr/ucb/echo -n -n $as_echo) 2>/dev/null`" = "X-n $as_echo"; then + as_echo_body='eval /usr/ucb/echo -n "$1$as_nl"' + as_echo_n='/usr/ucb/echo -n' + else + as_echo_body='eval expr "X$1" : "X\\(.*\\)"' + as_echo_n_body='eval + arg=$1; + case $arg in #( + *"$as_nl"*) + expr "X$arg" : "X\\(.*\\)$as_nl"; + arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;; + esac; + expr "X$arg" : "X\\(.*\\)" | tr -d "$as_nl" + ' + export as_echo_n_body + as_echo_n='sh -c $as_echo_n_body as_echo' + fi + export as_echo_body + as_echo='sh -c $as_echo_body as_echo' +fi + +# The user is always right. 
+if test "${PATH_SEPARATOR+set}" != set; then + PATH_SEPARATOR=: + (PATH='/bin;/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 && { + (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 || + PATH_SEPARATOR=';' + } +fi + + +# IFS +# We need space, tab and new line, in precisely that order. Quoting is +# there to prevent editors from complaining about space-tab. +# (If _AS_PATH_WALK were called with IFS unset, it would disable word +# splitting by setting IFS to empty value.) +IFS=" "" $as_nl" + +# Find who we are. Look in the path if we contain no directory separator. +as_myself= +case $0 in #(( + *[\\/]* ) as_myself=$0 ;; + *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break + done +IFS=$as_save_IFS + + ;; +esac +# We did not find ourselves, most probably we were run as `sh COMMAND' +# in which case we are not to be found in the path. +if test "x$as_myself" = x; then + as_myself=$0 +fi +if test ! -f "$as_myself"; then + $as_echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2 + exit 1 +fi + +# Unset variables that we do not need and which cause bugs (e.g. in +# pre-3.0 UWIN ksh). But do not cause bugs in bash 2.01; the "|| exit 1" +# suppresses any "Segmentation fault" message there. '((' could +# trigger a bug in pdksh 5.2.14. +for as_var in BASH_ENV ENV MAIL MAILPATH +do eval test x\${$as_var+set} = xset \ + && ( (unset $as_var) || exit 1) >/dev/null 2>&1 && unset $as_var || : +done +PS1='$ ' +PS2='> ' +PS4='+ ' + +# NLS nuisances. +LC_ALL=C +export LC_ALL +LANGUAGE=C +export LANGUAGE + +# CDPATH. +(unset CDPATH) >/dev/null 2>&1 && unset CDPATH + +# Use a proper internal environment variable to ensure we don't fall + # into an infinite loop, continuously re-executing ourselves. + if test x"${_as_can_reexec}" != xno && test "x$CONFIG_SHELL" != x; then + _as_can_reexec=no; export _as_can_reexec; + # We cannot yet assume a decent shell, so we have to provide a +# neutralization value for shells without unset; and this also +# works around shells that cannot unset nonexistent variables. +# Preserve -v and -x to the replacement shell. +BASH_ENV=/dev/null +ENV=/dev/null +(unset BASH_ENV) >/dev/null 2>&1 && unset BASH_ENV ENV +case $- in # (((( + *v*x* | *x*v* ) as_opts=-vx ;; + *v* ) as_opts=-v ;; + *x* ) as_opts=-x ;; + * ) as_opts= ;; +esac +exec $CONFIG_SHELL $as_opts "$as_myself" ${1+"$@"} +# Admittedly, this is quite paranoid, since all the known shells bail +# out after a failed `exec'. +$as_echo "$0: could not re-execute with $CONFIG_SHELL" >&2 +as_fn_exit 255 + fi + # We don't want this to propagate to other subprocesses. + { _as_can_reexec=; unset _as_can_reexec;} +if test "x$CONFIG_SHELL" = x; then + as_bourne_compatible="if test -n \"\${ZSH_VERSION+set}\" && (emulate sh) >/dev/null 2>&1; then : + emulate sh + NULLCMD=: + # Pre-4.2 versions of Zsh do word splitting on \${1+\"\$@\"}, which + # is contrary to our usage. Disable this feature. 
+ alias -g '\${1+\"\$@\"}'='\"\$@\"' + setopt NO_GLOB_SUBST +else + case \`(set -o) 2>/dev/null\` in #( + *posix*) : + set -o posix ;; #( + *) : + ;; +esac +fi +" + as_required="as_fn_return () { (exit \$1); } +as_fn_success () { as_fn_return 0; } +as_fn_failure () { as_fn_return 1; } +as_fn_ret_success () { return 0; } +as_fn_ret_failure () { return 1; } + +exitcode=0 +as_fn_success || { exitcode=1; echo as_fn_success failed.; } +as_fn_failure && { exitcode=1; echo as_fn_failure succeeded.; } +as_fn_ret_success || { exitcode=1; echo as_fn_ret_success failed.; } +as_fn_ret_failure && { exitcode=1; echo as_fn_ret_failure succeeded.; } +if ( set x; as_fn_ret_success y && test x = \"\$1\" ); then : + +else + exitcode=1; echo positional parameters were not saved. +fi +test x\$exitcode = x0 || exit 1 +test -x / || exit 1" + as_suggested=" as_lineno_1=";as_suggested=$as_suggested$LINENO;as_suggested=$as_suggested" as_lineno_1a=\$LINENO + as_lineno_2=";as_suggested=$as_suggested$LINENO;as_suggested=$as_suggested" as_lineno_2a=\$LINENO + eval 'test \"x\$as_lineno_1'\$as_run'\" != \"x\$as_lineno_2'\$as_run'\" && + test \"x\`expr \$as_lineno_1'\$as_run' + 1\`\" = \"x\$as_lineno_2'\$as_run'\"' || exit 1 +test \$(( 1 + 1 )) = 2 || exit 1" + if (eval "$as_required") 2>/dev/null; then : + as_have_required=yes +else + as_have_required=no +fi + if test x$as_have_required = xyes && (eval "$as_suggested") 2>/dev/null; then : + +else + as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +as_found=false +for as_dir in /bin$PATH_SEPARATOR/usr/bin$PATH_SEPARATOR$PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + as_found=: + case $as_dir in #( + /*) + for as_base in sh bash ksh sh5; do + # Try only shells that exist, to save several forks. + as_shell=$as_dir/$as_base + if { test -f "$as_shell" || test -f "$as_shell.exe"; } && + { $as_echo "$as_bourne_compatible""$as_required" | as_run=a "$as_shell"; } 2>/dev/null; then : + CONFIG_SHELL=$as_shell as_have_required=yes + if { $as_echo "$as_bourne_compatible""$as_suggested" | as_run=a "$as_shell"; } 2>/dev/null; then : + break 2 +fi +fi + done;; + esac + as_found=false +done +$as_found || { if { test -f "$SHELL" || test -f "$SHELL.exe"; } && + { $as_echo "$as_bourne_compatible""$as_required" | as_run=a "$SHELL"; } 2>/dev/null; then : + CONFIG_SHELL=$SHELL as_have_required=yes +fi; } +IFS=$as_save_IFS + + + if test "x$CONFIG_SHELL" != x; then : + export CONFIG_SHELL + # We cannot yet assume a decent shell, so we have to provide a +# neutralization value for shells without unset; and this also +# works around shells that cannot unset nonexistent variables. +# Preserve -v and -x to the replacement shell. +BASH_ENV=/dev/null +ENV=/dev/null +(unset BASH_ENV) >/dev/null 2>&1 && unset BASH_ENV ENV +case $- in # (((( + *v*x* | *x*v* ) as_opts=-vx ;; + *v* ) as_opts=-v ;; + *x* ) as_opts=-x ;; + * ) as_opts= ;; +esac +exec $CONFIG_SHELL $as_opts "$as_myself" ${1+"$@"} +# Admittedly, this is quite paranoid, since all the known shells bail +# out after a failed `exec'. +$as_echo "$0: could not re-execute with $CONFIG_SHELL" >&2 +exit 255 +fi + + if test x$as_have_required = xno; then : + $as_echo "$0: This script requires a shell more modern than all" + $as_echo "$0: the shells that I found on your system." + if test x${ZSH_VERSION+set} = xset ; then + $as_echo "$0: In particular, zsh $ZSH_VERSION has bugs and should" + $as_echo "$0: be upgraded to zsh 4.3.4 or later." 
+ else + $as_echo "$0: Please tell bug-autoconf@gnu.org about your system, +$0: including any error possibly output before this +$0: message. Then install a modern shell, or manually run +$0: the script under such a shell if you do have one." + fi + exit 1 +fi +fi +fi +SHELL=${CONFIG_SHELL-/bin/sh} +export SHELL +# Unset more variables known to interfere with behavior of common tools. +CLICOLOR_FORCE= GREP_OPTIONS= +unset CLICOLOR_FORCE GREP_OPTIONS + +## --------------------- ## +## M4sh Shell Functions. ## +## --------------------- ## +# as_fn_unset VAR +# --------------- +# Portably unset VAR. +as_fn_unset () +{ + { eval $1=; unset $1;} +} +as_unset=as_fn_unset + +# as_fn_set_status STATUS +# ----------------------- +# Set $? to STATUS, without forking. +as_fn_set_status () +{ + return $1 +} # as_fn_set_status + +# as_fn_exit STATUS +# ----------------- +# Exit the shell with STATUS, even in a "trap 0" or "set -e" context. +as_fn_exit () +{ + set +e + as_fn_set_status $1 + exit $1 +} # as_fn_exit + +# as_fn_mkdir_p +# ------------- +# Create "$as_dir" as a directory, including parents if necessary. +as_fn_mkdir_p () +{ + + case $as_dir in #( + -*) as_dir=./$as_dir;; + esac + test -d "$as_dir" || eval $as_mkdir_p || { + as_dirs= + while :; do + case $as_dir in #( + *\'*) as_qdir=`$as_echo "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'( + *) as_qdir=$as_dir;; + esac + as_dirs="'$as_qdir' $as_dirs" + as_dir=`$as_dirname -- "$as_dir" || +$as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ + X"$as_dir" : 'X\(//\)[^/]' \| \ + X"$as_dir" : 'X\(//\)$' \| \ + X"$as_dir" : 'X\(/\)' \| . 2>/dev/null || +$as_echo X"$as_dir" | + sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ + s//\1/ + q + } + /^X\(\/\/\)[^/].*/{ + s//\1/ + q + } + /^X\(\/\/\)$/{ + s//\1/ + q + } + /^X\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + test -d "$as_dir" && break + done + test -z "$as_dirs" || eval "mkdir $as_dirs" + } || test -d "$as_dir" || as_fn_error $? "cannot create directory $as_dir" + + +} # as_fn_mkdir_p + +# as_fn_executable_p FILE +# ----------------------- +# Test if FILE is an executable regular file. +as_fn_executable_p () +{ + test -f "$1" && test -x "$1" +} # as_fn_executable_p +# as_fn_append VAR VALUE +# ---------------------- +# Append the text in VALUE to the end of the definition contained in VAR. Take +# advantage of any shell optimizations that allow amortized linear growth over +# repeated appends, instead of the typical quadratic growth present in naive +# implementations. +if (eval "as_var=1; as_var+=2; test x\$as_var = x12") 2>/dev/null; then : + eval 'as_fn_append () + { + eval $1+=\$2 + }' +else + as_fn_append () + { + eval $1=\$$1\$2 + } +fi # as_fn_append + +# as_fn_arith ARG... +# ------------------ +# Perform arithmetic evaluation on the ARGs, and store the result in the +# global $as_val. Take advantage of shells that can avoid forks. The arguments +# must be portable across $(()) and expr. +if (eval "test \$(( 1 + 1 )) = 2") 2>/dev/null; then : + eval 'as_fn_arith () + { + as_val=$(( $* )) + }' +else + as_fn_arith () + { + as_val=`expr "$@" || test $? -eq 1` + } +fi # as_fn_arith + + +# as_fn_error STATUS ERROR [LINENO LOG_FD] +# ---------------------------------------- +# Output "`basename $0`: error: ERROR" to stderr. If LINENO and LOG_FD are +# provided, also output the error to LOG_FD, referencing LINENO. Then exit the +# script with STATUS, using 1 if that was 0. 
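# --- Illustrative aside ---
# The as_fn_* helpers defined above are thin portability shims: as_fn_append
# uses `+=` where the shell supports it and falls back to plain reassignment,
# and as_fn_arith prefers $(( )) over forking expr, leaving its result in
# $as_val. A minimal usage sketch (my_flags is a made-up variable, not one
# this configure script actually defines):
my_flags=-O2
as_fn_append my_flags " -Wall"   # my_flags is now "-O2 -Wall"
as_fn_arith 40 + 2               # numeric result is left in $as_val
echo "$my_flags ($as_val)"       # prints: -O2 -Wall (42)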
+as_fn_error () +{ + as_status=$1; test $as_status -eq 0 && as_status=1 + if test "$4"; then + as_lineno=${as_lineno-"$3"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + $as_echo "$as_me:${as_lineno-$LINENO}: error: $2" >&$4 + fi + $as_echo "$as_me: error: $2" >&2 + as_fn_exit $as_status +} # as_fn_error + +if expr a : '\(a\)' >/dev/null 2>&1 && + test "X`expr 00001 : '.*\(...\)'`" = X001; then + as_expr=expr +else + as_expr=false +fi + +if (basename -- /) >/dev/null 2>&1 && test "X`basename -- / 2>&1`" = "X/"; then + as_basename=basename +else + as_basename=false +fi + +if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then + as_dirname=dirname +else + as_dirname=false +fi + +as_me=`$as_basename -- "$0" || +$as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \ + X"$0" : 'X\(//\)$' \| \ + X"$0" : 'X\(/\)' \| . 2>/dev/null || +$as_echo X/"$0" | + sed '/^.*\/\([^/][^/]*\)\/*$/{ + s//\1/ + q + } + /^X\/\(\/\/\)$/{ + s//\1/ + q + } + /^X\/\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + +# Avoid depending upon Character Ranges. +as_cr_letters='abcdefghijklmnopqrstuvwxyz' +as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ' +as_cr_Letters=$as_cr_letters$as_cr_LETTERS +as_cr_digits='0123456789' +as_cr_alnum=$as_cr_Letters$as_cr_digits + + + as_lineno_1=$LINENO as_lineno_1a=$LINENO + as_lineno_2=$LINENO as_lineno_2a=$LINENO + eval 'test "x$as_lineno_1'$as_run'" != "x$as_lineno_2'$as_run'" && + test "x`expr $as_lineno_1'$as_run' + 1`" = "x$as_lineno_2'$as_run'"' || { + # Blame Lee E. McMahon (1931-1989) for sed's syntax. :-) + sed -n ' + p + /[$]LINENO/= + ' <$as_myself | + sed ' + s/[$]LINENO.*/&-/ + t lineno + b + :lineno + N + :loop + s/[$]LINENO\([^'$as_cr_alnum'_].*\n\)\(.*\)/\2\1\2/ + t loop + s/-\n.*// + ' >$as_me.lineno && + chmod +x "$as_me.lineno" || + { $as_echo "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2; as_fn_exit 1; } + + # If we had to re-execute with $CONFIG_SHELL, we're ensured to have + # already done that, so ensure we don't try to do so again and fall + # in an infinite loop. This has already happened in practice. + _as_can_reexec=no; export _as_can_reexec + # Don't try to exec as it changes $[0], causing all sort of problems + # (the dirname of $[0] is not the place where we might find the + # original and so on. Autoconf is especially sensitive to this). + . "./$as_me.lineno" + # Exit status is that of the last command. + exit +} + +ECHO_C= ECHO_N= ECHO_T= +case `echo -n x` in #((((( +-n*) + case `echo 'xy\c'` in + *c*) ECHO_T=' ';; # ECHO_T is single tab character. + xy) ECHO_C='\c';; + *) echo `echo ksh88 bug on AIX 6.1` > /dev/null + ECHO_T=' ';; + esac;; +*) + ECHO_N='-n';; +esac + +rm -f conf$$ conf$$.exe conf$$.file +if test -d conf$$.dir; then + rm -f conf$$.dir/conf$$.file +else + rm -f conf$$.dir + mkdir conf$$.dir 2>/dev/null +fi +if (echo >conf$$.file) 2>/dev/null; then + if ln -s conf$$.file conf$$ 2>/dev/null; then + as_ln_s='ln -s' + # ... but there are two gotchas: + # 1) On MSYS, both `ln -s file dir' and `ln file dir' fail. + # 2) DJGPP < 2.04 has no symlinks; `ln -s' creates a wrapper executable. + # In both cases, we have to default to `cp -pR'. + ln -s conf$$.file conf$$.dir 2>/dev/null && test ! -f conf$$.exe || + as_ln_s='cp -pR' + elif ln conf$$.file conf$$ 2>/dev/null; then + as_ln_s=ln + else + as_ln_s='cp -pR' + fi +else + as_ln_s='cp -pR' +fi +rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file +rmdir conf$$.dir 2>/dev/null + +if mkdir -p . 
2>/dev/null; then
+  as_mkdir_p='mkdir -p "$as_dir"'
+else
+  test -d ./-p && rmdir ./-p
+  as_mkdir_p=false
+fi
+
+as_test_x='test -x'
+as_executable_p=as_fn_executable_p
+
+# Sed expression to map a string onto a valid CPP name.
+as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'"
+
+# Sed expression to map a string onto a valid variable name.
+as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'"
+
+
+test -n "$DJDIR" || exec 7<&0 </dev/null 6>&1
+
+# Name of the host.
+# hostname on some systems (SVR3.2, old GNU/Linux) returns a bogus exit status,
+# so uname gets run too.
+ac_hostname=`(hostname || uname -n) 2>/dev/null | sed 1q`
+
+#
+# Initializations.
+#
+ac_default_prefix=/usr/local
+ac_clean_files=
+ac_config_libobj_dir=.
+LIBOBJS=
+cross_compiling=no
+subdirs=
+MFLAGS=
+MAKEFLAGS=
+
+# Identity of this package.
+PACKAGE_NAME='ccminer'
+PACKAGE_TARNAME='ccminer'
+PACKAGE_VERSION='2014.03.18'
+PACKAGE_STRING='ccminer 2014.03.18'
+PACKAGE_BUGREPORT=''
+PACKAGE_URL=''
+
+ac_unique_file="cpu-miner.c"
+# Factoring default headers for most tests.
+ac_includes_default="\
+#include <stdio.h>
+#ifdef HAVE_SYS_TYPES_H
+# include <sys/types.h>
+#endif
+#ifdef HAVE_SYS_STAT_H
+# include <sys/stat.h>
+#endif
+#ifdef STDC_HEADERS
+# include <stdlib.h>
+# include <stddef.h>
+#else
+# ifdef HAVE_STDLIB_H
+# include <stdlib.h>
+# endif
+#endif
+#ifdef HAVE_STRING_H
+# if !defined STDC_HEADERS && defined HAVE_MEMORY_H
+# include <memory.h>
+# endif
+# include <string.h>
+#endif
+#ifdef HAVE_STRINGS_H
+# include <strings.h>
+#endif
+#ifdef HAVE_INTTYPES_H
+# include <inttypes.h>
+#endif
+#ifdef HAVE_STDINT_H
+# include <stdint.h>
+#endif
+#ifdef HAVE_UNISTD_H
+# include <unistd.h>
+#endif"
+
+ac_subst_vars='am__EXEEXT_FALSE
+am__EXEEXT_TRUE
+LTLIBOBJS
+LIBOBJS
+NVCC
+CUDA_LDFLAGS
+CUDA_LIBS
+CUDA_CFLAGS
+WS2_LIBS
+PTHREAD_LIBS
+PTHREAD_FLAGS
+JANSSON_LIBS
+LIBCURL
+LIBCURL_CPPFLAGS
+_libcurl_config
+ARCH_x86_64_FALSE
+ARCH_x86_64_TRUE
+ARCH_x86_FALSE
+ARCH_x86_TRUE
+HAVE_WINDOWS_FALSE
+HAVE_WINDOWS_TRUE
+WANT_JANSSON_FALSE
+WANT_JANSSON_TRUE
+ALLOCA
+OPENMP_CFLAGS
+am__fastdepCXX_FALSE
+am__fastdepCXX_TRUE
+CXXDEPMODE
+ac_ct_CXX
+CXXFLAGS
+CXX
+RANLIB
+am__fastdepCCAS_FALSE
+am__fastdepCCAS_TRUE
+CCASDEPMODE
+CCASFLAGS
+CCAS
+EGREP
+GREP
+CPP
+am__fastdepCC_FALSE
+am__fastdepCC_TRUE
+CCDEPMODE
+am__nodep
+AMDEPBACKSLASH
+AMDEP_FALSE
+AMDEP_TRUE
+am__quote
+am__include
+DEPDIR
+OBJEXT
+EXEEXT
+ac_ct_CC
+CPPFLAGS
+LDFLAGS
+CFLAGS
+CC
+MAINT
+MAINTAINER_MODE_FALSE
+MAINTAINER_MODE_TRUE
+AM_BACKSLASH
+AM_DEFAULT_VERBOSITY
+AM_DEFAULT_V
+AM_V
+am__untar
+am__tar
+AMTAR
+am__leading_dot
+SET_MAKE
+AWK
+mkdir_p
+MKDIR_P
+INSTALL_STRIP_PROGRAM
+STRIP
+install_sh
+MAKEINFO
+AUTOHEADER
+AUTOMAKE
+AUTOCONF
+ACLOCAL
+VERSION
+PACKAGE
+CYGPATH_W
+am__isrc
+INSTALL_DATA
+INSTALL_SCRIPT
+INSTALL_PROGRAM
+target_os
+target_vendor
+target_cpu
+target
+host_os
+host_vendor
+host_cpu
+host
+build_os
+build_vendor
+build_cpu
+build
+target_alias
+host_alias
+build_alias
+LIBS
+ECHO_T
+ECHO_N
+ECHO_C
+DEFS
+mandir
+localedir
+libdir
+psdir
+pdfdir
+dvidir
+htmldir
+infodir
+docdir
+oldincludedir
+includedir
+localstatedir
+sharedstatedir
+sysconfdir
+datadir
+datarootdir
+libexecdir
+sbindir
+bindir
+program_transform_name
+prefix
+exec_prefix
+PACKAGE_URL
+PACKAGE_BUGREPORT
+PACKAGE_STRING
+PACKAGE_VERSION
+PACKAGE_TARNAME
+PACKAGE_NAME
+PATH_SEPARATOR
+SHELL'
+ac_subst_files=''
+ac_user_opts='
+enable_option_checking
+enable_silent_rules
+enable_maintainer_mode
+enable_dependency_tracking
+enable_openmp
+with_libcurl
+with_cuda
+'
+ ac_precious_vars='build_alias
+host_alias
+target_alias
+CC
+CFLAGS
+LDFLAGS
+LIBS
+CPPFLAGS
+CPP
+CCAS
+CCASFLAGS +CXX +CXXFLAGS +CCC' + + +# Initialize some variables set by options. +ac_init_help= +ac_init_version=false +ac_unrecognized_opts= +ac_unrecognized_sep= +# The variables have the same names as the options, with +# dashes changed to underlines. +cache_file=/dev/null +exec_prefix=NONE +no_create= +no_recursion= +prefix=NONE +program_prefix=NONE +program_suffix=NONE +program_transform_name=s,x,x, +silent= +site= +srcdir= +verbose= +x_includes=NONE +x_libraries=NONE + +# Installation directory options. +# These are left unexpanded so users can "make install exec_prefix=/foo" +# and all the variables that are supposed to be based on exec_prefix +# by default will actually change. +# Use braces instead of parens because sh, perl, etc. also accept them. +# (The list follows the same order as the GNU Coding Standards.) +bindir='${exec_prefix}/bin' +sbindir='${exec_prefix}/sbin' +libexecdir='${exec_prefix}/libexec' +datarootdir='${prefix}/share' +datadir='${datarootdir}' +sysconfdir='${prefix}/etc' +sharedstatedir='${prefix}/com' +localstatedir='${prefix}/var' +includedir='${prefix}/include' +oldincludedir='/usr/include' +docdir='${datarootdir}/doc/${PACKAGE_TARNAME}' +infodir='${datarootdir}/info' +htmldir='${docdir}' +dvidir='${docdir}' +pdfdir='${docdir}' +psdir='${docdir}' +libdir='${exec_prefix}/lib' +localedir='${datarootdir}/locale' +mandir='${datarootdir}/man' + +ac_prev= +ac_dashdash= +for ac_option +do + # If the previous option needs an argument, assign it. + if test -n "$ac_prev"; then + eval $ac_prev=\$ac_option + ac_prev= + continue + fi + + case $ac_option in + *=?*) ac_optarg=`expr "X$ac_option" : '[^=]*=\(.*\)'` ;; + *=) ac_optarg= ;; + *) ac_optarg=yes ;; + esac + + # Accept the important Cygnus configure options, so we can diagnose typos. + + case $ac_dashdash$ac_option in + --) + ac_dashdash=yes ;; + + -bindir | --bindir | --bindi | --bind | --bin | --bi) + ac_prev=bindir ;; + -bindir=* | --bindir=* | --bindi=* | --bind=* | --bin=* | --bi=*) + bindir=$ac_optarg ;; + + -build | --build | --buil | --bui | --bu) + ac_prev=build_alias ;; + -build=* | --build=* | --buil=* | --bui=* | --bu=*) + build_alias=$ac_optarg ;; + + -cache-file | --cache-file | --cache-fil | --cache-fi \ + | --cache-f | --cache- | --cache | --cach | --cac | --ca | --c) + ac_prev=cache_file ;; + -cache-file=* | --cache-file=* | --cache-fil=* | --cache-fi=* \ + | --cache-f=* | --cache-=* | --cache=* | --cach=* | --cac=* | --ca=* | --c=*) + cache_file=$ac_optarg ;; + + --config-cache | -C) + cache_file=config.cache ;; + + -datadir | --datadir | --datadi | --datad) + ac_prev=datadir ;; + -datadir=* | --datadir=* | --datadi=* | --datad=*) + datadir=$ac_optarg ;; + + -datarootdir | --datarootdir | --datarootdi | --datarootd | --dataroot \ + | --dataroo | --dataro | --datar) + ac_prev=datarootdir ;; + -datarootdir=* | --datarootdir=* | --datarootdi=* | --datarootd=* \ + | --dataroot=* | --dataroo=* | --dataro=* | --datar=*) + datarootdir=$ac_optarg ;; + + -disable-* | --disable-*) + ac_useropt=`expr "x$ac_option" : 'x-*disable-\(.*\)'` + # Reject names that are not valid shell variable names. + expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && + as_fn_error $? 
"invalid feature name: $ac_useropt" + ac_useropt_orig=$ac_useropt + ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` + case $ac_user_opts in + *" +"enable_$ac_useropt" +"*) ;; + *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--disable-$ac_useropt_orig" + ac_unrecognized_sep=', ';; + esac + eval enable_$ac_useropt=no ;; + + -docdir | --docdir | --docdi | --doc | --do) + ac_prev=docdir ;; + -docdir=* | --docdir=* | --docdi=* | --doc=* | --do=*) + docdir=$ac_optarg ;; + + -dvidir | --dvidir | --dvidi | --dvid | --dvi | --dv) + ac_prev=dvidir ;; + -dvidir=* | --dvidir=* | --dvidi=* | --dvid=* | --dvi=* | --dv=*) + dvidir=$ac_optarg ;; + + -enable-* | --enable-*) + ac_useropt=`expr "x$ac_option" : 'x-*enable-\([^=]*\)'` + # Reject names that are not valid shell variable names. + expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && + as_fn_error $? "invalid feature name: $ac_useropt" + ac_useropt_orig=$ac_useropt + ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` + case $ac_user_opts in + *" +"enable_$ac_useropt" +"*) ;; + *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--enable-$ac_useropt_orig" + ac_unrecognized_sep=', ';; + esac + eval enable_$ac_useropt=\$ac_optarg ;; + + -exec-prefix | --exec_prefix | --exec-prefix | --exec-prefi \ + | --exec-pref | --exec-pre | --exec-pr | --exec-p | --exec- \ + | --exec | --exe | --ex) + ac_prev=exec_prefix ;; + -exec-prefix=* | --exec_prefix=* | --exec-prefix=* | --exec-prefi=* \ + | --exec-pref=* | --exec-pre=* | --exec-pr=* | --exec-p=* | --exec-=* \ + | --exec=* | --exe=* | --ex=*) + exec_prefix=$ac_optarg ;; + + -gas | --gas | --ga | --g) + # Obsolete; use --with-gas. + with_gas=yes ;; + + -help | --help | --hel | --he | -h) + ac_init_help=long ;; + -help=r* | --help=r* | --hel=r* | --he=r* | -hr*) + ac_init_help=recursive ;; + -help=s* | --help=s* | --hel=s* | --he=s* | -hs*) + ac_init_help=short ;; + + -host | --host | --hos | --ho) + ac_prev=host_alias ;; + -host=* | --host=* | --hos=* | --ho=*) + host_alias=$ac_optarg ;; + + -htmldir | --htmldir | --htmldi | --htmld | --html | --htm | --ht) + ac_prev=htmldir ;; + -htmldir=* | --htmldir=* | --htmldi=* | --htmld=* | --html=* | --htm=* \ + | --ht=*) + htmldir=$ac_optarg ;; + + -includedir | --includedir | --includedi | --included | --include \ + | --includ | --inclu | --incl | --inc) + ac_prev=includedir ;; + -includedir=* | --includedir=* | --includedi=* | --included=* | --include=* \ + | --includ=* | --inclu=* | --incl=* | --inc=*) + includedir=$ac_optarg ;; + + -infodir | --infodir | --infodi | --infod | --info | --inf) + ac_prev=infodir ;; + -infodir=* | --infodir=* | --infodi=* | --infod=* | --info=* | --inf=*) + infodir=$ac_optarg ;; + + -libdir | --libdir | --libdi | --libd) + ac_prev=libdir ;; + -libdir=* | --libdir=* | --libdi=* | --libd=*) + libdir=$ac_optarg ;; + + -libexecdir | --libexecdir | --libexecdi | --libexecd | --libexec \ + | --libexe | --libex | --libe) + ac_prev=libexecdir ;; + -libexecdir=* | --libexecdir=* | --libexecdi=* | --libexecd=* | --libexec=* \ + | --libexe=* | --libex=* | --libe=*) + libexecdir=$ac_optarg ;; + + -localedir | --localedir | --localedi | --localed | --locale) + ac_prev=localedir ;; + -localedir=* | --localedir=* | --localedi=* | --localed=* | --locale=*) + localedir=$ac_optarg ;; + + -localstatedir | --localstatedir | --localstatedi | --localstated \ + | --localstate | --localstat | --localsta | --localst | --locals) + ac_prev=localstatedir ;; + -localstatedir=* | --localstatedir=* | 
--localstatedi=* | --localstated=* \ + | --localstate=* | --localstat=* | --localsta=* | --localst=* | --locals=*) + localstatedir=$ac_optarg ;; + + -mandir | --mandir | --mandi | --mand | --man | --ma | --m) + ac_prev=mandir ;; + -mandir=* | --mandir=* | --mandi=* | --mand=* | --man=* | --ma=* | --m=*) + mandir=$ac_optarg ;; + + -nfp | --nfp | --nf) + # Obsolete; use --without-fp. + with_fp=no ;; + + -no-create | --no-create | --no-creat | --no-crea | --no-cre \ + | --no-cr | --no-c | -n) + no_create=yes ;; + + -no-recursion | --no-recursion | --no-recursio | --no-recursi \ + | --no-recurs | --no-recur | --no-recu | --no-rec | --no-re | --no-r) + no_recursion=yes ;; + + -oldincludedir | --oldincludedir | --oldincludedi | --oldincluded \ + | --oldinclude | --oldinclud | --oldinclu | --oldincl | --oldinc \ + | --oldin | --oldi | --old | --ol | --o) + ac_prev=oldincludedir ;; + -oldincludedir=* | --oldincludedir=* | --oldincludedi=* | --oldincluded=* \ + | --oldinclude=* | --oldinclud=* | --oldinclu=* | --oldincl=* | --oldinc=* \ + | --oldin=* | --oldi=* | --old=* | --ol=* | --o=*) + oldincludedir=$ac_optarg ;; + + -prefix | --prefix | --prefi | --pref | --pre | --pr | --p) + ac_prev=prefix ;; + -prefix=* | --prefix=* | --prefi=* | --pref=* | --pre=* | --pr=* | --p=*) + prefix=$ac_optarg ;; + + -program-prefix | --program-prefix | --program-prefi | --program-pref \ + | --program-pre | --program-pr | --program-p) + ac_prev=program_prefix ;; + -program-prefix=* | --program-prefix=* | --program-prefi=* \ + | --program-pref=* | --program-pre=* | --program-pr=* | --program-p=*) + program_prefix=$ac_optarg ;; + + -program-suffix | --program-suffix | --program-suffi | --program-suff \ + | --program-suf | --program-su | --program-s) + ac_prev=program_suffix ;; + -program-suffix=* | --program-suffix=* | --program-suffi=* \ + | --program-suff=* | --program-suf=* | --program-su=* | --program-s=*) + program_suffix=$ac_optarg ;; + + -program-transform-name | --program-transform-name \ + | --program-transform-nam | --program-transform-na \ + | --program-transform-n | --program-transform- \ + | --program-transform | --program-transfor \ + | --program-transfo | --program-transf \ + | --program-trans | --program-tran \ + | --progr-tra | --program-tr | --program-t) + ac_prev=program_transform_name ;; + -program-transform-name=* | --program-transform-name=* \ + | --program-transform-nam=* | --program-transform-na=* \ + | --program-transform-n=* | --program-transform-=* \ + | --program-transform=* | --program-transfor=* \ + | --program-transfo=* | --program-transf=* \ + | --program-trans=* | --program-tran=* \ + | --progr-tra=* | --program-tr=* | --program-t=*) + program_transform_name=$ac_optarg ;; + + -pdfdir | --pdfdir | --pdfdi | --pdfd | --pdf | --pd) + ac_prev=pdfdir ;; + -pdfdir=* | --pdfdir=* | --pdfdi=* | --pdfd=* | --pdf=* | --pd=*) + pdfdir=$ac_optarg ;; + + -psdir | --psdir | --psdi | --psd | --ps) + ac_prev=psdir ;; + -psdir=* | --psdir=* | --psdi=* | --psd=* | --ps=*) + psdir=$ac_optarg ;; + + -q | -quiet | --quiet | --quie | --qui | --qu | --q \ + | -silent | --silent | --silen | --sile | --sil) + silent=yes ;; + + -sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb) + ac_prev=sbindir ;; + -sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \ + | --sbi=* | --sb=*) + sbindir=$ac_optarg ;; + + -sharedstatedir | --sharedstatedir | --sharedstatedi \ + | --sharedstated | --sharedstate | --sharedstat | --sharedsta \ + | --sharedst | --shareds | --shared | --share | --shar \ + | 
--sha | --sh) + ac_prev=sharedstatedir ;; + -sharedstatedir=* | --sharedstatedir=* | --sharedstatedi=* \ + | --sharedstated=* | --sharedstate=* | --sharedstat=* | --sharedsta=* \ + | --sharedst=* | --shareds=* | --shared=* | --share=* | --shar=* \ + | --sha=* | --sh=*) + sharedstatedir=$ac_optarg ;; + + -site | --site | --sit) + ac_prev=site ;; + -site=* | --site=* | --sit=*) + site=$ac_optarg ;; + + -srcdir | --srcdir | --srcdi | --srcd | --src | --sr) + ac_prev=srcdir ;; + -srcdir=* | --srcdir=* | --srcdi=* | --srcd=* | --src=* | --sr=*) + srcdir=$ac_optarg ;; + + -sysconfdir | --sysconfdir | --sysconfdi | --sysconfd | --sysconf \ + | --syscon | --sysco | --sysc | --sys | --sy) + ac_prev=sysconfdir ;; + -sysconfdir=* | --sysconfdir=* | --sysconfdi=* | --sysconfd=* | --sysconf=* \ + | --syscon=* | --sysco=* | --sysc=* | --sys=* | --sy=*) + sysconfdir=$ac_optarg ;; + + -target | --target | --targe | --targ | --tar | --ta | --t) + ac_prev=target_alias ;; + -target=* | --target=* | --targe=* | --targ=* | --tar=* | --ta=* | --t=*) + target_alias=$ac_optarg ;; + + -v | -verbose | --verbose | --verbos | --verbo | --verb) + verbose=yes ;; + + -version | --version | --versio | --versi | --vers | -V) + ac_init_version=: ;; + + -with-* | --with-*) + ac_useropt=`expr "x$ac_option" : 'x-*with-\([^=]*\)'` + # Reject names that are not valid shell variable names. + expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && + as_fn_error $? "invalid package name: $ac_useropt" + ac_useropt_orig=$ac_useropt + ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` + case $ac_user_opts in + *" +"with_$ac_useropt" +"*) ;; + *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--with-$ac_useropt_orig" + ac_unrecognized_sep=', ';; + esac + eval with_$ac_useropt=\$ac_optarg ;; + + -without-* | --without-*) + ac_useropt=`expr "x$ac_option" : 'x-*without-\(.*\)'` + # Reject names that are not valid shell variable names. + expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && + as_fn_error $? "invalid package name: $ac_useropt" + ac_useropt_orig=$ac_useropt + ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` + case $ac_user_opts in + *" +"with_$ac_useropt" +"*) ;; + *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--without-$ac_useropt_orig" + ac_unrecognized_sep=', ';; + esac + eval with_$ac_useropt=no ;; + + --x) + # Obsolete; use --with-x. + with_x=yes ;; + + -x-includes | --x-includes | --x-include | --x-includ | --x-inclu \ + | --x-incl | --x-inc | --x-in | --x-i) + ac_prev=x_includes ;; + -x-includes=* | --x-includes=* | --x-include=* | --x-includ=* | --x-inclu=* \ + | --x-incl=* | --x-inc=* | --x-in=* | --x-i=*) + x_includes=$ac_optarg ;; + + -x-libraries | --x-libraries | --x-librarie | --x-librari \ + | --x-librar | --x-libra | --x-libr | --x-lib | --x-li | --x-l) + ac_prev=x_libraries ;; + -x-libraries=* | --x-libraries=* | --x-librarie=* | --x-librari=* \ + | --x-librar=* | --x-libra=* | --x-libr=* | --x-lib=* | --x-li=* | --x-l=*) + x_libraries=$ac_optarg ;; + + -*) as_fn_error $? "unrecognized option: \`$ac_option' +Try \`$0 --help' for more information" + ;; + + *=*) + ac_envvar=`expr "x$ac_option" : 'x\([^=]*\)='` + # Reject names that are not valid shell variable names. + case $ac_envvar in #( + '' | [0-9]* | *[!_$as_cr_alnum]* ) + as_fn_error $? "invalid variable name: \`$ac_envvar'" ;; + esac + eval $ac_envvar=\$ac_optarg + export $ac_envvar ;; + + *) + # FIXME: should be removed in autoconf 3.0. 
+ $as_echo "$as_me: WARNING: you should use --build, --host, --target" >&2 + expr "x$ac_option" : ".*[^-._$as_cr_alnum]" >/dev/null && + $as_echo "$as_me: WARNING: invalid host type: $ac_option" >&2 + : "${build_alias=$ac_option} ${host_alias=$ac_option} ${target_alias=$ac_option}" + ;; + + esac +done + +if test -n "$ac_prev"; then + ac_option=--`echo $ac_prev | sed 's/_/-/g'` + as_fn_error $? "missing argument to $ac_option" +fi + +if test -n "$ac_unrecognized_opts"; then + case $enable_option_checking in + no) ;; + fatal) as_fn_error $? "unrecognized options: $ac_unrecognized_opts" ;; + *) $as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2 ;; + esac +fi + +# Check all directory arguments for consistency. +for ac_var in exec_prefix prefix bindir sbindir libexecdir datarootdir \ + datadir sysconfdir sharedstatedir localstatedir includedir \ + oldincludedir docdir infodir htmldir dvidir pdfdir psdir \ + libdir localedir mandir +do + eval ac_val=\$$ac_var + # Remove trailing slashes. + case $ac_val in + */ ) + ac_val=`expr "X$ac_val" : 'X\(.*[^/]\)' \| "X$ac_val" : 'X\(.*\)'` + eval $ac_var=\$ac_val;; + esac + # Be sure to have absolute directory names. + case $ac_val in + [\\/$]* | ?:[\\/]* ) continue;; + NONE | '' ) case $ac_var in *prefix ) continue;; esac;; + esac + as_fn_error $? "expected an absolute directory name for --$ac_var: $ac_val" +done + +# There might be people who depend on the old broken behavior: `$host' +# used to hold the argument of --host etc. +# FIXME: To remove some day. +build=$build_alias +host=$host_alias +target=$target_alias + +# FIXME: To remove some day. +if test "x$host_alias" != x; then + if test "x$build_alias" = x; then + cross_compiling=maybe + elif test "x$build_alias" != "x$host_alias"; then + cross_compiling=yes + fi +fi + +ac_tool_prefix= +test -n "$host_alias" && ac_tool_prefix=$host_alias- + +test "$silent" = yes && exec 6>/dev/null + + +ac_pwd=`pwd` && test -n "$ac_pwd" && +ac_ls_di=`ls -di .` && +ac_pwd_ls_di=`cd "$ac_pwd" && ls -di .` || + as_fn_error $? "working directory cannot be determined" +test "X$ac_ls_di" = "X$ac_pwd_ls_di" || + as_fn_error $? "pwd does not report name of working directory" + + +# Find the source files, if location was not specified. +if test -z "$srcdir"; then + ac_srcdir_defaulted=yes + # Try the directory containing this script, then the parent directory. + ac_confdir=`$as_dirname -- "$as_myself" || +$as_expr X"$as_myself" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ + X"$as_myself" : 'X\(//\)[^/]' \| \ + X"$as_myself" : 'X\(//\)$' \| \ + X"$as_myself" : 'X\(/\)' \| . 2>/dev/null || +$as_echo X"$as_myself" | + sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ + s//\1/ + q + } + /^X\(\/\/\)[^/].*/{ + s//\1/ + q + } + /^X\(\/\/\)$/{ + s//\1/ + q + } + /^X\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + srcdir=$ac_confdir + if test ! -r "$srcdir/$ac_unique_file"; then + srcdir=.. + fi +else + ac_srcdir_defaulted=no +fi +if test ! -r "$srcdir/$ac_unique_file"; then + test "$ac_srcdir_defaulted" = yes && srcdir="$ac_confdir or .." + as_fn_error $? "cannot find sources ($ac_unique_file) in $srcdir" +fi +ac_msg="sources are in $srcdir, but \`cd $srcdir' does not work" +ac_abs_confdir=`( + cd "$srcdir" && test -r "./$ac_unique_file" || as_fn_error $? "$ac_msg" + pwd)` +# When building in place, set srcdir=. +if test "$ac_abs_confdir" = "$ac_pwd"; then + srcdir=. +fi +# Remove unnecessary trailing slashes from srcdir. +# Double slashes in file names in object file debugging info +# mess up M-x gdb in Emacs. 
+case $srcdir in +*/) srcdir=`expr "X$srcdir" : 'X\(.*[^/]\)' \| "X$srcdir" : 'X\(.*\)'`;; +esac +for ac_var in $ac_precious_vars; do + eval ac_env_${ac_var}_set=\${${ac_var}+set} + eval ac_env_${ac_var}_value=\$${ac_var} + eval ac_cv_env_${ac_var}_set=\${${ac_var}+set} + eval ac_cv_env_${ac_var}_value=\$${ac_var} +done + +# +# Report the --help message. +# +if test "$ac_init_help" = "long"; then + # Omit some internal or obsolete options to make the list less imposing. + # This message is too long to be a string in the A/UX 3.1 sh. + cat <<_ACEOF +\`configure' configures ccminer 2014.03.18 to adapt to many kinds of systems. + +Usage: $0 [OPTION]... [VAR=VALUE]... + +To assign environment variables (e.g., CC, CFLAGS...), specify them as +VAR=VALUE. See below for descriptions of some of the useful variables. + +Defaults for the options are specified in brackets. + +Configuration: + -h, --help display this help and exit + --help=short display options specific to this package + --help=recursive display the short help of all the included packages + -V, --version display version information and exit + -q, --quiet, --silent do not print \`checking ...' messages + --cache-file=FILE cache test results in FILE [disabled] + -C, --config-cache alias for \`--cache-file=config.cache' + -n, --no-create do not create output files + --srcdir=DIR find the sources in DIR [configure dir or \`..'] + +Installation directories: + --prefix=PREFIX install architecture-independent files in PREFIX + [$ac_default_prefix] + --exec-prefix=EPREFIX install architecture-dependent files in EPREFIX + [PREFIX] + +By default, \`make install' will install all the files in +\`$ac_default_prefix/bin', \`$ac_default_prefix/lib' etc. You can specify +an installation prefix other than \`$ac_default_prefix' using \`--prefix', +for instance \`--prefix=\$HOME'. + +For better control, use the options below. 
+
+Fine tuning of the installation directories:
+ --bindir=DIR user executables [EPREFIX/bin]
+ --sbindir=DIR system admin executables [EPREFIX/sbin]
+ --libexecdir=DIR program executables [EPREFIX/libexec]
+ --sysconfdir=DIR read-only single-machine data [PREFIX/etc]
+ --sharedstatedir=DIR modifiable architecture-independent data [PREFIX/com]
+ --localstatedir=DIR modifiable single-machine data [PREFIX/var]
+ --libdir=DIR object code libraries [EPREFIX/lib]
+ --includedir=DIR C header files [PREFIX/include]
+ --oldincludedir=DIR C header files for non-gcc [/usr/include]
+ --datarootdir=DIR read-only arch.-independent data root [PREFIX/share]
+ --datadir=DIR read-only architecture-independent data [DATAROOTDIR]
+ --infodir=DIR info documentation [DATAROOTDIR/info]
+ --localedir=DIR locale-dependent data [DATAROOTDIR/locale]
+ --mandir=DIR man documentation [DATAROOTDIR/man]
+ --docdir=DIR documentation root [DATAROOTDIR/doc/ccminer]
+ --htmldir=DIR html documentation [DOCDIR]
+ --dvidir=DIR dvi documentation [DOCDIR]
+ --pdfdir=DIR pdf documentation [DOCDIR]
+ --psdir=DIR ps documentation [DOCDIR]
+_ACEOF
+
+ cat <<\_ACEOF
+
+Program names:
+ --program-prefix=PREFIX prepend PREFIX to installed program names
+ --program-suffix=SUFFIX append SUFFIX to installed program names
+ --program-transform-name=PROGRAM run sed PROGRAM on installed program names
+
+System types:
+ --build=BUILD configure for building on BUILD [guessed]
+ --host=HOST cross-compile to build programs to run on HOST [BUILD]
+ --target=TARGET configure for building compilers for TARGET [HOST]
+_ACEOF
+fi
+
+if test -n "$ac_init_help"; then
+ case $ac_init_help in
+ short | recursive ) echo "Configuration of ccminer 2014.03.18:";;
+ esac
+ cat <<\_ACEOF
+
+Optional Features:
+ --disable-option-checking ignore unrecognized --enable/--with options
+ --disable-FEATURE do not include FEATURE (same as --enable-FEATURE=no)
+ --enable-FEATURE[=ARG] include FEATURE [ARG=yes]
+ --enable-silent-rules less verbose build output (undo: "make V=1")
+ --disable-silent-rules verbose build output (undo: "make V=0")
+ --enable-maintainer-mode
+ enable make rules and dependencies not useful (and
+ sometimes confusing) to the casual installer
+ --enable-dependency-tracking
+ do not reject slow dependency extractors
+ --disable-dependency-tracking
+ speeds up one-time build
+ --disable-openmp do not use OpenMP
+
+Optional Packages:
+ --with-PACKAGE[=ARG] use PACKAGE [ARG=yes]
+ --without-PACKAGE do not use PACKAGE (same as --with-PACKAGE=no)
+ --with-libcurl=PREFIX look for the curl library in PREFIX/lib and headers
+ in PREFIX/include
+ --with-cuda=PATH prefix where cuda is installed default=/usr/local/cuda
+
+Some influential environment variables:
+ CC C compiler command
+ CFLAGS C compiler flags
+ LDFLAGS linker flags, e.g. -L<lib dir> if you have libraries in a
+ nonstandard directory <lib dir>
+ LIBS libraries to pass to the linker, e.g. -l<library>
+ CPPFLAGS (Objective) C/C++ preprocessor flags, e.g. -I<include dir> if
+ you have headers in a nonstandard directory <include dir>
+ CPP C preprocessor
+ CCAS assembler compiler command (defaults to CC)
+ CCASFLAGS assembler compiler flags (defaults to CFLAGS)
+ CXX C++ compiler command
+ CXXFLAGS C++ compiler flags
+
+Use these variables to override the choices made by `configure' or to help
+it to find libraries and programs with nonstandard names/locations.
+
+Report bugs to the package provider.
+_ACEOF
+ac_status=$?
+fi
+
+if test "$ac_init_help" = "recursive"; then
+ # If there are subdirs, report their specific --help.
+ for ac_dir in : $ac_subdirs_all; do test "x$ac_dir" = x: && continue + test -d "$ac_dir" || + { cd "$srcdir" && ac_pwd=`pwd` && srcdir=. && test -d "$ac_dir"; } || + continue + ac_builddir=. + +case "$ac_dir" in +.) ac_dir_suffix= ac_top_builddir_sub=. ac_top_build_prefix= ;; +*) + ac_dir_suffix=/`$as_echo "$ac_dir" | sed 's|^\.[\\/]||'` + # A ".." for each directory in $ac_dir_suffix. + ac_top_builddir_sub=`$as_echo "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'` + case $ac_top_builddir_sub in + "") ac_top_builddir_sub=. ac_top_build_prefix= ;; + *) ac_top_build_prefix=$ac_top_builddir_sub/ ;; + esac ;; +esac +ac_abs_top_builddir=$ac_pwd +ac_abs_builddir=$ac_pwd$ac_dir_suffix +# for backward compatibility: +ac_top_builddir=$ac_top_build_prefix + +case $srcdir in + .) # We are building in place. + ac_srcdir=. + ac_top_srcdir=$ac_top_builddir_sub + ac_abs_top_srcdir=$ac_pwd ;; + [\\/]* | ?:[\\/]* ) # Absolute name. + ac_srcdir=$srcdir$ac_dir_suffix; + ac_top_srcdir=$srcdir + ac_abs_top_srcdir=$srcdir ;; + *) # Relative name. + ac_srcdir=$ac_top_build_prefix$srcdir$ac_dir_suffix + ac_top_srcdir=$ac_top_build_prefix$srcdir + ac_abs_top_srcdir=$ac_pwd/$srcdir ;; +esac +ac_abs_srcdir=$ac_abs_top_srcdir$ac_dir_suffix + + cd "$ac_dir" || { ac_status=$?; continue; } + # Check for guested configure. + if test -f "$ac_srcdir/configure.gnu"; then + echo && + $SHELL "$ac_srcdir/configure.gnu" --help=recursive + elif test -f "$ac_srcdir/configure"; then + echo && + $SHELL "$ac_srcdir/configure" --help=recursive + else + $as_echo "$as_me: WARNING: no configuration information is in $ac_dir" >&2 + fi || ac_status=$? + cd "$ac_pwd" || { ac_status=$?; break; } + done +fi + +test -n "$ac_init_help" && exit $ac_status +if $ac_init_version; then + cat <<\_ACEOF +ccminer configure 2014.03.18 +generated by GNU Autoconf 2.69 + +Copyright (C) 2012 Free Software Foundation, Inc. +This configure script is free software; the Free Software Foundation +gives unlimited permission to copy, distribute and modify it. +_ACEOF + exit +fi + +## ------------------------ ## +## Autoconf initialization. ## +## ------------------------ ## + +# ac_fn_c_try_compile LINENO +# -------------------------- +# Try to compile conftest.$ac_ext, and return whether this succeeded. +ac_fn_c_try_compile () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + rm -f conftest.$ac_objext + if { { ac_try="$ac_compile" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_compile") 2>conftest.err + ac_status=$? + if test -s conftest.err; then + grep -v '^ *+' conftest.err >conftest.er1 + cat conftest.er1 >&5 + mv -f conftest.er1 conftest.err + fi + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } && { + test -z "$ac_c_werror_flag" || + test ! -s conftest.err + } && test -s conftest.$ac_objext; then : + ac_retval=0 +else + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + ac_retval=1 +fi + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + as_fn_set_status $ac_retval + +} # ac_fn_c_try_compile + +# ac_fn_c_try_cpp LINENO +# ---------------------- +# Try to preprocess conftest.$ac_ext, and return whether this succeeded. 
+ac_fn_c_try_cpp () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + if { { ac_try="$ac_cpp conftest.$ac_ext" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_cpp conftest.$ac_ext") 2>conftest.err + ac_status=$? + if test -s conftest.err; then + grep -v '^ *+' conftest.err >conftest.er1 + cat conftest.er1 >&5 + mv -f conftest.er1 conftest.err + fi + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } > conftest.i && { + test -z "$ac_c_preproc_warn_flag$ac_c_werror_flag" || + test ! -s conftest.err + }; then : + ac_retval=0 +else + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + ac_retval=1 +fi + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + as_fn_set_status $ac_retval + +} # ac_fn_c_try_cpp + +# ac_fn_cxx_try_compile LINENO +# ---------------------------- +# Try to compile conftest.$ac_ext, and return whether this succeeded. +ac_fn_cxx_try_compile () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + rm -f conftest.$ac_objext + if { { ac_try="$ac_compile" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_compile") 2>conftest.err + ac_status=$? + if test -s conftest.err; then + grep -v '^ *+' conftest.err >conftest.er1 + cat conftest.er1 >&5 + mv -f conftest.er1 conftest.err + fi + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } && { + test -z "$ac_cxx_werror_flag" || + test ! -s conftest.err + } && test -s conftest.$ac_objext; then : + ac_retval=0 +else + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + ac_retval=1 +fi + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + as_fn_set_status $ac_retval + +} # ac_fn_cxx_try_compile + +# ac_fn_c_try_link LINENO +# ----------------------- +# Try to link conftest.$ac_ext, and return whether this succeeded. +ac_fn_c_try_link () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + rm -f conftest.$ac_objext conftest$ac_exeext + if { { ac_try="$ac_link" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_link") 2>conftest.err + ac_status=$? + if test -s conftest.err; then + grep -v '^ *+' conftest.err >conftest.er1 + cat conftest.er1 >&5 + mv -f conftest.er1 conftest.err + fi + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } && { + test -z "$ac_c_werror_flag" || + test ! -s conftest.err + } && test -s conftest$ac_exeext && { + test "$cross_compiling" = yes || + test -x conftest$ac_exeext + }; then : + ac_retval=0 +else + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + ac_retval=1 +fi + # Delete the IPA/IPO (Inter Procedural Analysis/Optimization) information + # created by the PGI compiler (conftest_ipa8_conftest.oo), as it would + # interfere with the next link command; also delete a directory that is + # left behind by Apple's compiler. We do this before executing the actions. 
+ rm -rf conftest.dSYM conftest_ipa8_conftest.oo + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + as_fn_set_status $ac_retval + +} # ac_fn_c_try_link + +# ac_fn_c_try_run LINENO +# ---------------------- +# Try to link conftest.$ac_ext, and return whether this succeeded. Assumes +# that executables *can* be run. +ac_fn_c_try_run () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + if { { ac_try="$ac_link" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_link") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } && { ac_try='./conftest$ac_exeext' + { { case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_try") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; }; then : + ac_retval=0 +else + $as_echo "$as_me: program exited with status $ac_status" >&5 + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + ac_retval=$ac_status +fi + rm -rf conftest.dSYM conftest_ipa8_conftest.oo + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + as_fn_set_status $ac_retval + +} # ac_fn_c_try_run + +# ac_fn_c_check_header_mongrel LINENO HEADER VAR INCLUDES +# ------------------------------------------------------- +# Tests whether HEADER exists, giving a warning if it cannot be compiled using +# the include files in INCLUDES and setting the cache variable VAR +# accordingly. +ac_fn_c_check_header_mongrel () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + if eval \${$3+:} false; then : + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5 +$as_echo_n "checking for $2... " >&6; } +if eval \${$3+:} false; then : + $as_echo_n "(cached) " >&6 +fi +eval ac_res=\$$3 + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } +else + # Is the header compilable? +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking $2 usability" >&5 +$as_echo_n "checking $2 usability... " >&6; } +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +$4 +#include <$2> +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_header_compiler=yes +else + ac_header_compiler=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_header_compiler" >&5 +$as_echo "$ac_header_compiler" >&6; } + +# Is the header present? +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking $2 presence" >&5 +$as_echo_n "checking $2 presence... " >&6; } +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include <$2> +_ACEOF +if ac_fn_c_try_cpp "$LINENO"; then : + ac_header_preproc=yes +else + ac_header_preproc=no +fi +rm -f conftest.err conftest.i conftest.$ac_ext +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_header_preproc" >&5 +$as_echo "$ac_header_preproc" >&6; } + +# So? What about this header? +case $ac_header_compiler:$ac_header_preproc:$ac_c_preproc_warn_flag in #(( + yes:no: ) + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: accepted by the compiler, rejected by the preprocessor!" 
>&5 +$as_echo "$as_me: WARNING: $2: accepted by the compiler, rejected by the preprocessor!" >&2;} + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: proceeding with the compiler's result" >&5 +$as_echo "$as_me: WARNING: $2: proceeding with the compiler's result" >&2;} + ;; + no:yes:* ) + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: present but cannot be compiled" >&5 +$as_echo "$as_me: WARNING: $2: present but cannot be compiled" >&2;} + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: check for missing prerequisite headers?" >&5 +$as_echo "$as_me: WARNING: $2: check for missing prerequisite headers?" >&2;} + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: see the Autoconf documentation" >&5 +$as_echo "$as_me: WARNING: $2: see the Autoconf documentation" >&2;} + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: section \"Present But Cannot Be Compiled\"" >&5 +$as_echo "$as_me: WARNING: $2: section \"Present But Cannot Be Compiled\"" >&2;} + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: proceeding with the compiler's result" >&5 +$as_echo "$as_me: WARNING: $2: proceeding with the compiler's result" >&2;} + ;; +esac + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5 +$as_echo_n "checking for $2... " >&6; } +if eval \${$3+:} false; then : + $as_echo_n "(cached) " >&6 +else + eval "$3=\$ac_header_compiler" +fi +eval ac_res=\$$3 + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } +fi + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + +} # ac_fn_c_check_header_mongrel + +# ac_fn_c_check_header_compile LINENO HEADER VAR INCLUDES +# ------------------------------------------------------- +# Tests whether HEADER exists and can be compiled using the include files in +# INCLUDES, setting the cache variable VAR accordingly. +ac_fn_c_check_header_compile () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5 +$as_echo_n "checking for $2... " >&6; } +if eval \${$3+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +$4 +#include <$2> +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + eval "$3=yes" +else + eval "$3=no" +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +eval ac_res=\$$3 + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + +} # ac_fn_c_check_header_compile + +# ac_fn_c_check_decl LINENO SYMBOL VAR INCLUDES +# --------------------------------------------- +# Tests whether SYMBOL is declared in INCLUDES, setting cache variable VAR +# accordingly. +ac_fn_c_check_decl () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + as_decl_name=`echo $2|sed 's/ *(.*//'` + as_decl_use=`echo $2|sed -e 's/(/((/' -e 's/)/) 0&/' -e 's/,/) 0& (/g'` + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $as_decl_name is declared" >&5 +$as_echo_n "checking whether $as_decl_name is declared... " >&6; } +if eval \${$3+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/
+$4
+int
+main ()
+{
+#ifndef $as_decl_name
+#ifdef __cplusplus
+ (void) $as_decl_use;
+#else
+ (void) $as_decl_name;
+#endif
+#endif
+
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+ eval "$3=yes"
+else
+ eval "$3=no"
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+eval ac_res=\$$3
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
+$as_echo "$ac_res" >&6; }
+ eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
+
+} # ac_fn_c_check_decl
+
+# ac_fn_c_check_type LINENO TYPE VAR INCLUDES
+# -------------------------------------------
+# Tests whether TYPE exists after having included INCLUDES, setting cache
+# variable VAR accordingly.
+ac_fn_c_check_type ()
+{
+ as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+ { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5
+$as_echo_n "checking for $2... " >&6; }
+if eval \${$3+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ eval "$3=no"
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+$4
+int
+main ()
+{
+if (sizeof ($2))
+ return 0;
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+$4
+int
+main ()
+{
+if (sizeof (($2)))
+ return 0;
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+
+else
+ eval "$3=yes"
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+eval ac_res=\$$3
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
+$as_echo "$ac_res" >&6; }
+ eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
+
+} # ac_fn_c_check_type
+
+# ac_fn_c_check_func LINENO FUNC VAR
+# ----------------------------------
+# Tests whether FUNC exists, setting the cache variable VAR accordingly
+ac_fn_c_check_func ()
+{
+ as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+ { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5
+$as_echo_n "checking for $2... " >&6; }
+if eval \${$3+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+/* Define $2 to an innocuous variant, in case <limits.h> declares $2.
+ For example, HP-UX 11i <limits.h> declares gettimeofday. */
+#define $2 innocuous_$2
+
+/* System header to define __stub macros and hopefully few prototypes,
+ which can conflict with char $2 (); below.
+ Prefer <limits.h> to <assert.h> if __STDC__ is defined, since
+ <limits.h> exists even on freestanding compilers. */
+
+#ifdef __STDC__
+# include <limits.h>
+#else
+# include <assert.h>
+#endif
+
+#undef $2
+
+/* Override any GCC internal prototype to avoid an error.
+ Use char because int might match the return type of a GCC
+ builtin and then its argument prototype would still apply. */
+#ifdef __cplusplus
+extern "C"
+#endif
+char $2 ();
+/* The GNU C library defines this for functions which it implements
+ to always fail with ENOSYS. Some functions are actually named
+ something starting with __ and the normal name is an alias.
*/ +#if defined __stub_$2 || defined __stub___$2 +choke me +#endif + +int +main () +{ +return $2 (); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + eval "$3=yes" +else + eval "$3=no" +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +fi +eval ac_res=\$$3 + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + +} # ac_fn_c_check_func +cat >config.log <<_ACEOF +This file contains any messages produced by compilers while +running configure, to aid debugging if configure makes a mistake. + +It was created by ccminer $as_me 2014.03.18, which was +generated by GNU Autoconf 2.69. Invocation command line was + + $ $0 $@ + +_ACEOF +exec 5>>config.log +{ +cat <<_ASUNAME +## --------- ## +## Platform. ## +## --------- ## + +hostname = `(hostname || uname -n) 2>/dev/null | sed 1q` +uname -m = `(uname -m) 2>/dev/null || echo unknown` +uname -r = `(uname -r) 2>/dev/null || echo unknown` +uname -s = `(uname -s) 2>/dev/null || echo unknown` +uname -v = `(uname -v) 2>/dev/null || echo unknown` + +/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null || echo unknown` +/bin/uname -X = `(/bin/uname -X) 2>/dev/null || echo unknown` + +/bin/arch = `(/bin/arch) 2>/dev/null || echo unknown` +/usr/bin/arch -k = `(/usr/bin/arch -k) 2>/dev/null || echo unknown` +/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null || echo unknown` +/usr/bin/hostinfo = `(/usr/bin/hostinfo) 2>/dev/null || echo unknown` +/bin/machine = `(/bin/machine) 2>/dev/null || echo unknown` +/usr/bin/oslevel = `(/usr/bin/oslevel) 2>/dev/null || echo unknown` +/bin/universe = `(/bin/universe) 2>/dev/null || echo unknown` + +_ASUNAME + +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + $as_echo "PATH: $as_dir" + done +IFS=$as_save_IFS + +} >&5 + +cat >&5 <<_ACEOF + + +## ----------- ## +## Core tests. ## +## ----------- ## + +_ACEOF + + +# Keep a trace of the command line. +# Strip out --no-create and --no-recursion so they do not pile up. +# Strip out --silent because we don't want to record it for future runs. +# Also quote any args containing shell meta-characters. +# Make two passes to allow for proper duplicate-argument suppression. +ac_configure_args= +ac_configure_args0= +ac_configure_args1= +ac_must_keep_next=false +for ac_pass in 1 2 +do + for ac_arg + do + case $ac_arg in + -no-create | --no-c* | -n | -no-recursion | --no-r*) continue ;; + -q | -quiet | --quiet | --quie | --qui | --qu | --q \ + | -silent | --silent | --silen | --sile | --sil) + continue ;; + *\'*) + ac_arg=`$as_echo "$ac_arg" | sed "s/'/'\\\\\\\\''/g"` ;; + esac + case $ac_pass in + 1) as_fn_append ac_configure_args0 " '$ac_arg'" ;; + 2) + as_fn_append ac_configure_args1 " '$ac_arg'" + if test $ac_must_keep_next = true; then + ac_must_keep_next=false # Got value, back to normal. 
+ else + case $ac_arg in + *=* | --config-cache | -C | -disable-* | --disable-* \ + | -enable-* | --enable-* | -gas | --g* | -nfp | --nf* \ + | -q | -quiet | --q* | -silent | --sil* | -v | -verb* \ + | -with-* | --with-* | -without-* | --without-* | --x) + case "$ac_configure_args0 " in + "$ac_configure_args1"*" '$ac_arg' "* ) continue ;; + esac + ;; + -* ) ac_must_keep_next=true ;; + esac + fi + as_fn_append ac_configure_args " '$ac_arg'" + ;; + esac + done +done +{ ac_configure_args0=; unset ac_configure_args0;} +{ ac_configure_args1=; unset ac_configure_args1;} + +# When interrupted or exit'd, cleanup temporary files, and complete +# config.log. We remove comments because anyway the quotes in there +# would cause problems or look ugly. +# WARNING: Use '\'' to represent an apostrophe within the trap. +# WARNING: Do not start the trap code with a newline, due to a FreeBSD 4.0 bug. +trap 'exit_status=$? + # Save into config.log some information that might help in debugging. + { + echo + + $as_echo "## ---------------- ## +## Cache variables. ## +## ---------------- ##" + echo + # The following way of writing the cache mishandles newlines in values, +( + for ac_var in `(set) 2>&1 | sed -n '\''s/^\([a-zA-Z_][a-zA-Z0-9_]*\)=.*/\1/p'\''`; do + eval ac_val=\$$ac_var + case $ac_val in #( + *${as_nl}*) + case $ac_var in #( + *_cv_*) { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: cache variable $ac_var contains a newline" >&5 +$as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;; + esac + case $ac_var in #( + _ | IFS | as_nl) ;; #( + BASH_ARGV | BASH_SOURCE) eval $ac_var= ;; #( + *) { eval $ac_var=; unset $ac_var;} ;; + esac ;; + esac + done + (set) 2>&1 | + case $as_nl`(ac_space='\'' '\''; set) 2>&1` in #( + *${as_nl}ac_space=\ *) + sed -n \ + "s/'\''/'\''\\\\'\'''\''/g; + s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='\''\\2'\''/p" + ;; #( + *) + sed -n "/^[_$as_cr_alnum]*_cv_[_$as_cr_alnum]*=/p" + ;; + esac | + sort +) + echo + + $as_echo "## ----------------- ## +## Output variables. ## +## ----------------- ##" + echo + for ac_var in $ac_subst_vars + do + eval ac_val=\$$ac_var + case $ac_val in + *\'\''*) ac_val=`$as_echo "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;; + esac + $as_echo "$ac_var='\''$ac_val'\''" + done | sort + echo + + if test -n "$ac_subst_files"; then + $as_echo "## ------------------- ## +## File substitutions. ## +## ------------------- ##" + echo + for ac_var in $ac_subst_files + do + eval ac_val=\$$ac_var + case $ac_val in + *\'\''*) ac_val=`$as_echo "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;; + esac + $as_echo "$ac_var='\''$ac_val'\''" + done | sort + echo + fi + + if test -s confdefs.h; then + $as_echo "## ----------- ## +## confdefs.h. ## +## ----------- ##" + echo + cat confdefs.h + echo + fi + test "$ac_signal" != 0 && + $as_echo "$as_me: caught signal $ac_signal" + $as_echo "$as_me: exit $exit_status" + } >&5 + rm -f core *.core core.conftest.* && + rm -f -r conftest* confdefs* conf$$* $ac_clean_files && + exit $exit_status +' 0 +for ac_signal in 1 2 13 15; do + trap 'ac_signal='$ac_signal'; as_fn_exit 1' $ac_signal +done +ac_signal=0 + +# confdefs.h avoids OS command line length limits that DEFS can exceed. +rm -f -r conftest* confdefs.h + +$as_echo "/* confdefs.h */" > confdefs.h + +# Predefined preprocessor variables. 
+ +cat >>confdefs.h <<_ACEOF +#define PACKAGE_NAME "$PACKAGE_NAME" +_ACEOF + +cat >>confdefs.h <<_ACEOF +#define PACKAGE_TARNAME "$PACKAGE_TARNAME" +_ACEOF + +cat >>confdefs.h <<_ACEOF +#define PACKAGE_VERSION "$PACKAGE_VERSION" +_ACEOF + +cat >>confdefs.h <<_ACEOF +#define PACKAGE_STRING "$PACKAGE_STRING" +_ACEOF + +cat >>confdefs.h <<_ACEOF +#define PACKAGE_BUGREPORT "$PACKAGE_BUGREPORT" +_ACEOF + +cat >>confdefs.h <<_ACEOF +#define PACKAGE_URL "$PACKAGE_URL" +_ACEOF + + +# Let the site file select an alternate cache file if it wants to. +# Prefer an explicitly selected file to automatically selected ones. +ac_site_file1=NONE +ac_site_file2=NONE +if test -n "$CONFIG_SITE"; then + # We do not want a PATH search for config.site. + case $CONFIG_SITE in #(( + -*) ac_site_file1=./$CONFIG_SITE;; + */*) ac_site_file1=$CONFIG_SITE;; + *) ac_site_file1=./$CONFIG_SITE;; + esac +elif test "x$prefix" != xNONE; then + ac_site_file1=$prefix/share/config.site + ac_site_file2=$prefix/etc/config.site +else + ac_site_file1=$ac_default_prefix/share/config.site + ac_site_file2=$ac_default_prefix/etc/config.site +fi +for ac_site_file in "$ac_site_file1" "$ac_site_file2" +do + test "x$ac_site_file" = xNONE && continue + if test /dev/null != "$ac_site_file" && test -r "$ac_site_file"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: loading site script $ac_site_file" >&5 +$as_echo "$as_me: loading site script $ac_site_file" >&6;} + sed 's/^/| /' "$ac_site_file" >&5 + . "$ac_site_file" \ + || { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? "failed to load site script $ac_site_file +See \`config.log' for more details" "$LINENO" 5; } + fi +done + +if test -r "$cache_file"; then + # Some versions of bash will fail to source /dev/null (special files + # actually), so we avoid doing that. DJGPP emulates it as a regular file. + if test /dev/null != "$cache_file" && test -f "$cache_file"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: loading cache $cache_file" >&5 +$as_echo "$as_me: loading cache $cache_file" >&6;} + case $cache_file in + [\\/]* | ?:[\\/]* ) . "$cache_file";; + *) . "./$cache_file";; + esac + fi +else + { $as_echo "$as_me:${as_lineno-$LINENO}: creating cache $cache_file" >&5 +$as_echo "$as_me: creating cache $cache_file" >&6;} + >$cache_file +fi + +# Check that the precious variables saved in the cache have kept the same +# value. +ac_cache_corrupted=false +for ac_var in $ac_precious_vars; do + eval ac_old_set=\$ac_cv_env_${ac_var}_set + eval ac_new_set=\$ac_env_${ac_var}_set + eval ac_old_val=\$ac_cv_env_${ac_var}_value + eval ac_new_val=\$ac_env_${ac_var}_value + case $ac_old_set,$ac_new_set in + set,) + { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&5 +$as_echo "$as_me: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&2;} + ac_cache_corrupted=: ;; + ,set) + { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' was not set in the previous run" >&5 +$as_echo "$as_me: error: \`$ac_var' was not set in the previous run" >&2;} + ac_cache_corrupted=: ;; + ,);; + *) + if test "x$ac_old_val" != "x$ac_new_val"; then + # differences in whitespace do not lead to failure. 
+ ac_old_val_w=`echo x $ac_old_val` + ac_new_val_w=`echo x $ac_new_val` + if test "$ac_old_val_w" != "$ac_new_val_w"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' has changed since the previous run:" >&5 +$as_echo "$as_me: error: \`$ac_var' has changed since the previous run:" >&2;} + ac_cache_corrupted=: + else + { $as_echo "$as_me:${as_lineno-$LINENO}: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&5 +$as_echo "$as_me: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&2;} + eval $ac_var=\$ac_old_val + fi + { $as_echo "$as_me:${as_lineno-$LINENO}: former value: \`$ac_old_val'" >&5 +$as_echo "$as_me: former value: \`$ac_old_val'" >&2;} + { $as_echo "$as_me:${as_lineno-$LINENO}: current value: \`$ac_new_val'" >&5 +$as_echo "$as_me: current value: \`$ac_new_val'" >&2;} + fi;; + esac + # Pass precious variables to config.status. + if test "$ac_new_set" = set; then + case $ac_new_val in + *\'*) ac_arg=$ac_var=`$as_echo "$ac_new_val" | sed "s/'/'\\\\\\\\''/g"` ;; + *) ac_arg=$ac_var=$ac_new_val ;; + esac + case " $ac_configure_args " in + *" '$ac_arg' "*) ;; # Avoid dups. Use of quotes ensures accuracy. + *) as_fn_append ac_configure_args " '$ac_arg'" ;; + esac + fi +done +if $ac_cache_corrupted; then + { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} + { $as_echo "$as_me:${as_lineno-$LINENO}: error: changes in the environment can compromise the build" >&5 +$as_echo "$as_me: error: changes in the environment can compromise the build" >&2;} + as_fn_error $? "run \`make distclean' and/or \`rm $cache_file' and start over" "$LINENO" 5 +fi +## -------------------- ## +## Main body of script. ## +## -------------------- ## + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + + + + +ac_aux_dir= +for ac_dir in "$srcdir" "$srcdir/.." "$srcdir/../.."; do + if test -f "$ac_dir/install-sh"; then + ac_aux_dir=$ac_dir + ac_install_sh="$ac_aux_dir/install-sh -c" + break + elif test -f "$ac_dir/install.sh"; then + ac_aux_dir=$ac_dir + ac_install_sh="$ac_aux_dir/install.sh -c" + break + elif test -f "$ac_dir/shtool"; then + ac_aux_dir=$ac_dir + ac_install_sh="$ac_aux_dir/shtool install -c" + break + fi +done +if test -z "$ac_aux_dir"; then + as_fn_error $? "cannot find install-sh, install.sh, or shtool in \"$srcdir\" \"$srcdir/..\" \"$srcdir/../..\"" "$LINENO" 5 +fi + +# These three variables are undocumented and unsupported, +# and are intended to be withdrawn in a future Autoconf release. +# They can cause serious problems if a builder's source tree is in a directory +# whose full name contains unusual characters. +ac_config_guess="$SHELL $ac_aux_dir/config.guess" # Please don't use this var. +ac_config_sub="$SHELL $ac_aux_dir/config.sub" # Please don't use this var. +ac_configure="$SHELL $ac_aux_dir/configure" # Please don't use this var. + + +# Make sure we can run config.sub. +$SHELL "$ac_aux_dir/config.sub" sun4 >/dev/null 2>&1 || + as_fn_error $? "cannot run $SHELL $ac_aux_dir/config.sub" "$LINENO" 5 + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking build system type" >&5 +$as_echo_n "checking build system type... 
" >&6; } +if ${ac_cv_build+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_build_alias=$build_alias +test "x$ac_build_alias" = x && + ac_build_alias=`$SHELL "$ac_aux_dir/config.guess"` +test "x$ac_build_alias" = x && + as_fn_error $? "cannot guess build type; you must specify one" "$LINENO" 5 +ac_cv_build=`$SHELL "$ac_aux_dir/config.sub" $ac_build_alias` || + as_fn_error $? "$SHELL $ac_aux_dir/config.sub $ac_build_alias failed" "$LINENO" 5 + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_build" >&5 +$as_echo "$ac_cv_build" >&6; } +case $ac_cv_build in +*-*-*) ;; +*) as_fn_error $? "invalid value of canonical build" "$LINENO" 5;; +esac +build=$ac_cv_build +ac_save_IFS=$IFS; IFS='-' +set x $ac_cv_build +shift +build_cpu=$1 +build_vendor=$2 +shift; shift +# Remember, the first character of IFS is used to create $*, +# except with old shells: +build_os=$* +IFS=$ac_save_IFS +case $build_os in *\ *) build_os=`echo "$build_os" | sed 's/ /-/g'`;; esac + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking host system type" >&5 +$as_echo_n "checking host system type... " >&6; } +if ${ac_cv_host+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test "x$host_alias" = x; then + ac_cv_host=$ac_cv_build +else + ac_cv_host=`$SHELL "$ac_aux_dir/config.sub" $host_alias` || + as_fn_error $? "$SHELL $ac_aux_dir/config.sub $host_alias failed" "$LINENO" 5 +fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_host" >&5 +$as_echo "$ac_cv_host" >&6; } +case $ac_cv_host in +*-*-*) ;; +*) as_fn_error $? "invalid value of canonical host" "$LINENO" 5;; +esac +host=$ac_cv_host +ac_save_IFS=$IFS; IFS='-' +set x $ac_cv_host +shift +host_cpu=$1 +host_vendor=$2 +shift; shift +# Remember, the first character of IFS is used to create $*, +# except with old shells: +host_os=$* +IFS=$ac_save_IFS +case $host_os in *\ *) host_os=`echo "$host_os" | sed 's/ /-/g'`;; esac + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking target system type" >&5 +$as_echo_n "checking target system type... " >&6; } +if ${ac_cv_target+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test "x$target_alias" = x; then + ac_cv_target=$ac_cv_host +else + ac_cv_target=`$SHELL "$ac_aux_dir/config.sub" $target_alias` || + as_fn_error $? "$SHELL $ac_aux_dir/config.sub $target_alias failed" "$LINENO" 5 +fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_target" >&5 +$as_echo "$ac_cv_target" >&6; } +case $ac_cv_target in +*-*-*) ;; +*) as_fn_error $? "invalid value of canonical target" "$LINENO" 5;; +esac +target=$ac_cv_target +ac_save_IFS=$IFS; IFS='-' +set x $ac_cv_target +shift +target_cpu=$1 +target_vendor=$2 +shift; shift +# Remember, the first character of IFS is used to create $*, +# except with old shells: +target_os=$* +IFS=$ac_save_IFS +case $target_os in *\ *) target_os=`echo "$target_os" | sed 's/ /-/g'`;; esac + + +# The aliases save the names the user supplied, while $host etc. +# will get canonicalized. +test -n "$target_alias" && + test "$program_prefix$program_suffix$program_transform_name" = \ + NONENONEs,x,x, && + program_prefix=${target_alias}- + + +am__api_version='1.13' + +# Find a good install program. We prefer a C program (faster), +# so one script is as good as another. 
But avoid the broken or +# incompatible versions: +# SysV /etc/install, /usr/sbin/install +# SunOS /usr/etc/install +# IRIX /sbin/install +# AIX /bin/install +# AmigaOS /C/install, which installs bootblocks on floppy discs +# AIX 4 /usr/bin/installbsd, which doesn't work without a -g flag +# AFS /usr/afsws/bin/install, which mishandles nonexistent args +# SVR4 /usr/ucb/install, which tries to use the nonexistent group "staff" +# OS/2's system install, which has a completely different semantic +# ./install, which can be erroneously created by make from ./install.sh. +# Reject install programs that cannot install multiple files. +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for a BSD-compatible install" >&5 +$as_echo_n "checking for a BSD-compatible install... " >&6; } +if test -z "$INSTALL"; then +if ${ac_cv_path_install+:} false; then : + $as_echo_n "(cached) " >&6 +else + as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + # Account for people who put trailing slashes in PATH elements. +case $as_dir/ in #(( + ./ | .// | /[cC]/* | \ + /etc/* | /usr/sbin/* | /usr/etc/* | /sbin/* | /usr/afsws/bin/* | \ + ?:[\\/]os2[\\/]install[\\/]* | ?:[\\/]OS2[\\/]INSTALL[\\/]* | \ + /usr/ucb/* ) ;; + *) + # OSF1 and SCO ODT 3.0 have their own names for install. + # Don't use installbsd from OSF since it installs stuff as root + # by default. + for ac_prog in ginstall scoinst install; do + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_prog$ac_exec_ext"; then + if test $ac_prog = install && + grep dspmsg "$as_dir/$ac_prog$ac_exec_ext" >/dev/null 2>&1; then + # AIX install. It has an incompatible calling convention. + : + elif test $ac_prog = install && + grep pwplus "$as_dir/$ac_prog$ac_exec_ext" >/dev/null 2>&1; then + # program-specific install script used by HP pwplus--don't use. + : + else + rm -rf conftest.one conftest.two conftest.dir + echo one > conftest.one + echo two > conftest.two + mkdir conftest.dir + if "$as_dir/$ac_prog$ac_exec_ext" -c conftest.one conftest.two "`pwd`/conftest.dir" && + test -s conftest.one && test -s conftest.two && + test -s conftest.dir/conftest.one && + test -s conftest.dir/conftest.two + then + ac_cv_path_install="$as_dir/$ac_prog$ac_exec_ext -c" + break 3 + fi + fi + fi + done + done + ;; +esac + + done +IFS=$as_save_IFS + +rm -rf conftest.one conftest.two conftest.dir + +fi + if test "${ac_cv_path_install+set}" = set; then + INSTALL=$ac_cv_path_install + else + # As a last resort, use the slow shell script. Don't cache a + # value for INSTALL within a source directory, because that will + # break other packages using the cache if that directory is + # removed, or if the value is a relative name. + INSTALL=$ac_install_sh + fi +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $INSTALL" >&5 +$as_echo "$INSTALL" >&6; } + +# Use test -z because SunOS4 sh mishandles braces in ${var-val}. +# It thinks the first close brace ends the variable substitution. +test -z "$INSTALL_PROGRAM" && INSTALL_PROGRAM='${INSTALL}' + +test -z "$INSTALL_SCRIPT" && INSTALL_SCRIPT='${INSTALL}' + +test -z "$INSTALL_DATA" && INSTALL_DATA='${INSTALL} -m 644' + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether build environment is sane" >&5 +$as_echo_n "checking whether build environment is sane... " >&6; } +# Reject unsafe characters in $srcdir or the absolute working directory +# name. Accept space and tab only in the latter. 
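An illustrative aside (not text emitted by autoconf): the install probe above only runs while INSTALL is empty, so a builder can skip the PATH search by presetting the variable when invoking the script, for example

    ./configure INSTALL=/usr/bin/install    # example path only; any BSD-compatible install that handles multiple files works

If nothing suitable is found, the script instead falls back to the slower install-sh wrapper located in the auxiliary directory earlier in the run.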
+am_lf=' +' +case `pwd` in + *[\\\"\#\$\&\'\`$am_lf]*) + as_fn_error $? "unsafe absolute working directory name" "$LINENO" 5;; +esac +case $srcdir in + *[\\\"\#\$\&\'\`$am_lf\ \ ]*) + as_fn_error $? "unsafe srcdir value: '$srcdir'" "$LINENO" 5;; +esac + +# Do 'set' in a subshell so we don't clobber the current shell's +# arguments. Must try -L first in case configure is actually a +# symlink; some systems play weird games with the mod time of symlinks +# (eg FreeBSD returns the mod time of the symlink's containing +# directory). +if ( + am_has_slept=no + for am_try in 1 2; do + echo "timestamp, slept: $am_has_slept" > conftest.file + set X `ls -Lt "$srcdir/configure" conftest.file 2> /dev/null` + if test "$*" = "X"; then + # -L didn't work. + set X `ls -t "$srcdir/configure" conftest.file` + fi + if test "$*" != "X $srcdir/configure conftest.file" \ + && test "$*" != "X conftest.file $srcdir/configure"; then + + # If neither matched, then we have a broken ls. This can happen + # if, for instance, CONFIG_SHELL is bash and it inherits a + # broken ls alias from the environment. This has actually + # happened. Such a system could not be considered "sane". + as_fn_error $? "ls -t appears to fail. Make sure there is not a broken + alias in your environment" "$LINENO" 5 + fi + if test "$2" = conftest.file || test $am_try -eq 2; then + break + fi + # Just in case. + sleep 1 + am_has_slept=yes + done + test "$2" = conftest.file + ) +then + # Ok. + : +else + as_fn_error $? "newly created file is older than distributed files! +Check your system clock" "$LINENO" 5 +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } +# If we didn't sleep, we still need to ensure time stamps of config.status and +# generated files are strictly newer. +am_sleep_pid= +if grep 'slept: no' conftest.file >/dev/null 2>&1; then + ( sleep 1 ) & + am_sleep_pid=$! +fi + +rm -f conftest.file + +test "$program_prefix" != NONE && + program_transform_name="s&^&$program_prefix&;$program_transform_name" +# Use a double $ so make ignores it. +test "$program_suffix" != NONE && + program_transform_name="s&\$&$program_suffix&;$program_transform_name" +# Double any \ or $. +# By default was `s,x,x', remove it if useless. +ac_script='s/[\\$]/&&/g;s/;s,x,x,$//' +program_transform_name=`$as_echo "$program_transform_name" | sed "$ac_script"` + +# expand $ac_aux_dir to an absolute path +am_aux_dir=`cd $ac_aux_dir && pwd` + +if test x"${MISSING+set}" != xset; then + case $am_aux_dir in + *\ * | *\ *) + MISSING="\${SHELL} \"$am_aux_dir/missing\"" ;; + *) + MISSING="\${SHELL} $am_aux_dir/missing" ;; + esac +fi +# Use eval to expand $SHELL +if eval "$MISSING --is-lightweight"; then + am_missing_run="$MISSING " +else + am_missing_run= + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: 'missing' script is too old or missing" >&5 +$as_echo "$as_me: WARNING: 'missing' script is too old or missing" >&2;} +fi + +if test x"${install_sh}" != xset; then + case $am_aux_dir in + *\ * | *\ *) + install_sh="\${SHELL} '$am_aux_dir/install-sh'" ;; + *) + install_sh="\${SHELL} $am_aux_dir/install-sh" + esac +fi + +# Installed binaries are usually stripped using 'strip' when the user +# run "make install-strip". However 'strip' might not be the right +# tool to use in cross-compilation environments, therefore Automake +# will honor the 'STRIP' environment variable to overrule this program. 
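For illustration only (editorial, not generated output): the sed program assembled above from --program-prefix and --program-suffix is what later renames installed binaries, so its effect can be sketched as

    ./configure --program-prefix=cuda-      # hypothetical prefix; installed program names get "cuda-" prepended

Only the transform is recorded at configure time; the actual renaming happens when "make install" applies program_transform_name.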
+if test "$cross_compiling" != no; then + if test -n "$ac_tool_prefix"; then + # Extract the first word of "${ac_tool_prefix}strip", so it can be a program name with args. +set dummy ${ac_tool_prefix}strip; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_STRIP+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$STRIP"; then + ac_cv_prog_STRIP="$STRIP" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_STRIP="${ac_tool_prefix}strip" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +STRIP=$ac_cv_prog_STRIP +if test -n "$STRIP"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $STRIP" >&5 +$as_echo "$STRIP" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + +fi +if test -z "$ac_cv_prog_STRIP"; then + ac_ct_STRIP=$STRIP + # Extract the first word of "strip", so it can be a program name with args. +set dummy strip; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_ac_ct_STRIP+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$ac_ct_STRIP"; then + ac_cv_prog_ac_ct_STRIP="$ac_ct_STRIP" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_ac_ct_STRIP="strip" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +ac_ct_STRIP=$ac_cv_prog_ac_ct_STRIP +if test -n "$ac_ct_STRIP"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_STRIP" >&5 +$as_echo "$ac_ct_STRIP" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + if test "x$ac_ct_STRIP" = x; then + STRIP=":" + else + case $cross_compiling:$ac_tool_warned in +yes:) +{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 +$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +ac_tool_warned=yes ;; +esac + STRIP=$ac_ct_STRIP + fi +else + STRIP="$ac_cv_prog_STRIP" +fi + +fi +INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s" + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for a thread-safe mkdir -p" >&5 +$as_echo_n "checking for a thread-safe mkdir -p... " >&6; } +if test -z "$MKDIR_P"; then + if ${ac_cv_path_mkdir+:} false; then : + $as_echo_n "(cached) " >&6 +else + as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH$PATH_SEPARATOR/opt/sfw/bin +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_prog in mkdir gmkdir; do + for ac_exec_ext in '' $ac_executable_extensions; do + as_fn_executable_p "$as_dir/$ac_prog$ac_exec_ext" || continue + case `"$as_dir/$ac_prog$ac_exec_ext" --version 2>&1` in #( + 'mkdir (GNU coreutils) '* | \ + 'mkdir (coreutils) '* | \ + 'mkdir (fileutils) '4.1*) + ac_cv_path_mkdir=$as_dir/$ac_prog$ac_exec_ext + break 3;; + esac + done + done + done +IFS=$as_save_IFS + +fi + + test -d ./--version && rmdir ./--version + if test "${ac_cv_path_mkdir+set}" = set; then + MKDIR_P="$ac_cv_path_mkdir -p" + else + # As a last resort, use the slow shell script. Don't cache a + # value for MKDIR_P within a source directory, because that will + # break other packages using the cache if that directory is + # removed, or if the value is a relative name. + MKDIR_P="$ac_install_sh -d" + fi +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $MKDIR_P" >&5 +$as_echo "$MKDIR_P" >&6; } + +for ac_prog in gawk mawk nawk awk +do + # Extract the first word of "$ac_prog", so it can be a program name with args. +set dummy $ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_AWK+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$AWK"; then + ac_cv_prog_AWK="$AWK" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_AWK="$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +AWK=$ac_cv_prog_AWK +if test -n "$AWK"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $AWK" >&5 +$as_echo "$AWK" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$AWK" && break +done + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ${MAKE-make} sets \$(MAKE)" >&5 +$as_echo_n "checking whether ${MAKE-make} sets \$(MAKE)... " >&6; } +set x ${MAKE-make} +ac_make=`$as_echo "$2" | sed 's/+/p/g; s/[^a-zA-Z0-9_]/_/g'` +if eval \${ac_cv_prog_make_${ac_make}_set+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat >conftest.make <<\_ACEOF +SHELL = /bin/sh +all: + @echo '@@@%%%=$(MAKE)=@@@%%%' +_ACEOF +# GNU make sometimes prints "make[1]: Entering ...", which would confuse us. +case `${MAKE-make} -f conftest.make 2>/dev/null` in + *@@@%%%=?*=@@@%%%*) + eval ac_cv_prog_make_${ac_make}_set=yes;; + *) + eval ac_cv_prog_make_${ac_make}_set=no;; +esac +rm -f conftest.make +fi +if eval test \$ac_cv_prog_make_${ac_make}_set = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + SET_MAKE= +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } + SET_MAKE="MAKE=${MAKE-make}" +fi + +rm -rf .tst 2>/dev/null +mkdir .tst 2>/dev/null +if test -d .tst; then + am__leading_dot=. +else + am__leading_dot=_ +fi +rmdir .tst 2>/dev/null + +# Check whether --enable-silent-rules was given. 
+if test "${enable_silent_rules+set}" = set; then : + enableval=$enable_silent_rules; +fi + +case $enable_silent_rules in # ((( + yes) AM_DEFAULT_VERBOSITY=0;; + no) AM_DEFAULT_VERBOSITY=1;; + *) AM_DEFAULT_VERBOSITY=1;; +esac +am_make=${MAKE-make} +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $am_make supports nested variables" >&5 +$as_echo_n "checking whether $am_make supports nested variables... " >&6; } +if ${am_cv_make_support_nested_variables+:} false; then : + $as_echo_n "(cached) " >&6 +else + if $as_echo 'TRUE=$(BAR$(V)) +BAR0=false +BAR1=true +V=1 +am__doit: + @$(TRUE) +.PHONY: am__doit' | $am_make -f - >/dev/null 2>&1; then + am_cv_make_support_nested_variables=yes +else + am_cv_make_support_nested_variables=no +fi +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_make_support_nested_variables" >&5 +$as_echo "$am_cv_make_support_nested_variables" >&6; } +if test $am_cv_make_support_nested_variables = yes; then + AM_V='$(V)' + AM_DEFAULT_V='$(AM_DEFAULT_VERBOSITY)' +else + AM_V=$AM_DEFAULT_VERBOSITY + AM_DEFAULT_V=$AM_DEFAULT_VERBOSITY +fi +AM_BACKSLASH='\' + +if test "`cd $srcdir && pwd`" != "`pwd`"; then + # Use -I$(srcdir) only when $(srcdir) != ., so that make's output + # is not polluted with repeated "-I." + am__isrc=' -I$(srcdir)' + # test to see if srcdir already configured + if test -f $srcdir/config.status; then + as_fn_error $? "source directory already configured; run \"make distclean\" there first" "$LINENO" 5 + fi +fi + +# test whether we have cygpath +if test -z "$CYGPATH_W"; then + if (cygpath --version) >/dev/null 2>/dev/null; then + CYGPATH_W='cygpath -w' + else + CYGPATH_W=echo + fi +fi + + +# Define the identity of the package. + PACKAGE='ccminer' + VERSION='2014.03.18' + + +cat >>confdefs.h <<_ACEOF +#define PACKAGE "$PACKAGE" +_ACEOF + + +cat >>confdefs.h <<_ACEOF +#define VERSION "$VERSION" +_ACEOF + +# Some tools Automake needs. + +ACLOCAL=${ACLOCAL-"${am_missing_run}aclocal-${am__api_version}"} + + +AUTOCONF=${AUTOCONF-"${am_missing_run}autoconf"} + + +AUTOMAKE=${AUTOMAKE-"${am_missing_run}automake-${am__api_version}"} + + +AUTOHEADER=${AUTOHEADER-"${am_missing_run}autoheader"} + + +MAKEINFO=${MAKEINFO-"${am_missing_run}makeinfo"} + +# For better backward compatibility. To be removed once Automake 1.9.x +# dies out for good. For more background, see: +# +# +mkdir_p='$(MKDIR_P)' + +# We need awk for the "check" target. The system "awk" is bad on +# some platforms. +# Always define AMTAR for backward compatibility. Yes, it's still used +# in the wild :-( We should find a proper way to deprecate it ... +AMTAR='$${TAR-tar}' + + +# We'll loop over all known methods to create a tar archive until one works. +_am_tools='gnutar pax cpio none' + +am__tar='$${TAR-tar} chof - "$$tardir"' am__untar='$${TAR-tar} xf -' + + + + + + +ac_config_headers="$ac_config_headers cpuminer-config.h" + + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to enable maintainer-specific portions of Makefiles" >&5 +$as_echo_n "checking whether to enable maintainer-specific portions of Makefiles... " >&6; } + # Check whether --enable-maintainer-mode was given. 
+if test "${enable_maintainer_mode+set}" = set; then : + enableval=$enable_maintainer_mode; USE_MAINTAINER_MODE=$enableval +else + USE_MAINTAINER_MODE=no +fi + + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $USE_MAINTAINER_MODE" >&5 +$as_echo "$USE_MAINTAINER_MODE" >&6; } + if test $USE_MAINTAINER_MODE = yes; then + MAINTAINER_MODE_TRUE= + MAINTAINER_MODE_FALSE='#' +else + MAINTAINER_MODE_TRUE='#' + MAINTAINER_MODE_FALSE= +fi + + MAINT=$MAINTAINER_MODE_TRUE + + + +DEPDIR="${am__leading_dot}deps" + +ac_config_commands="$ac_config_commands depfiles" + + +am_make=${MAKE-make} +cat > confinc << 'END' +am__doit: + @echo this is the am__doit target +.PHONY: am__doit +END +# If we don't find an include directive, just comment out the code. +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for style of include used by $am_make" >&5 +$as_echo_n "checking for style of include used by $am_make... " >&6; } +am__include="#" +am__quote= +_am_result=none +# First try GNU make style include. +echo "include confinc" > confmf +# Ignore all kinds of additional output from 'make'. +case `$am_make -s -f confmf 2> /dev/null` in #( +*the\ am__doit\ target*) + am__include=include + am__quote= + _am_result=GNU + ;; +esac +# Now try BSD make style include. +if test "$am__include" = "#"; then + echo '.include "confinc"' > confmf + case `$am_make -s -f confmf 2> /dev/null` in #( + *the\ am__doit\ target*) + am__include=.include + am__quote="\"" + _am_result=BSD + ;; + esac +fi + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $_am_result" >&5 +$as_echo "$_am_result" >&6; } +rm -f confinc confmf + +# Check whether --enable-dependency-tracking was given. +if test "${enable_dependency_tracking+set}" = set; then : + enableval=$enable_dependency_tracking; +fi + +if test "x$enable_dependency_tracking" != xno; then + am_depcomp="$ac_aux_dir/depcomp" + AMDEPBACKSLASH='\' + am__nodep='_no' +fi + if test "x$enable_dependency_tracking" != xno; then + AMDEP_TRUE= + AMDEP_FALSE='#' +else + AMDEP_TRUE='#' + AMDEP_FALSE= +fi + + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu +if test -n "$ac_tool_prefix"; then + # Extract the first word of "${ac_tool_prefix}gcc", so it can be a program name with args. +set dummy ${ac_tool_prefix}gcc; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_CC+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$CC"; then + ac_cv_prog_CC="$CC" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_CC="${ac_tool_prefix}gcc" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +CC=$ac_cv_prog_CC +if test -n "$CC"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5 +$as_echo "$CC" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + +fi +if test -z "$ac_cv_prog_CC"; then + ac_ct_CC=$CC + # Extract the first word of "gcc", so it can be a program name with args. 
+set dummy gcc; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_ac_ct_CC+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$ac_ct_CC"; then + ac_cv_prog_ac_ct_CC="$ac_ct_CC" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_ac_ct_CC="gcc" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +ac_ct_CC=$ac_cv_prog_ac_ct_CC +if test -n "$ac_ct_CC"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CC" >&5 +$as_echo "$ac_ct_CC" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + if test "x$ac_ct_CC" = x; then + CC="" + else + case $cross_compiling:$ac_tool_warned in +yes:) +{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 +$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +ac_tool_warned=yes ;; +esac + CC=$ac_ct_CC + fi +else + CC="$ac_cv_prog_CC" +fi + +if test -z "$CC"; then + if test -n "$ac_tool_prefix"; then + # Extract the first word of "${ac_tool_prefix}cc", so it can be a program name with args. +set dummy ${ac_tool_prefix}cc; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_CC+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$CC"; then + ac_cv_prog_CC="$CC" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_CC="${ac_tool_prefix}cc" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +CC=$ac_cv_prog_CC +if test -n "$CC"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5 +$as_echo "$CC" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + fi +fi +if test -z "$CC"; then + # Extract the first word of "cc", so it can be a program name with args. +set dummy cc; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_CC+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$CC"; then + ac_cv_prog_CC="$CC" # Let the user override the test. +else + ac_prog_rejected=no +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + if test "$as_dir/$ac_word$ac_exec_ext" = "/usr/ucb/cc"; then + ac_prog_rejected=yes + continue + fi + ac_cv_prog_CC="cc" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +if test $ac_prog_rejected = yes; then + # We found a bogon in the path, so make sure we never use it. 
+ set dummy $ac_cv_prog_CC + shift + if test $# != 0; then + # We chose a different compiler from the bogus one. + # However, it has the same basename, so the bogon will be chosen + # first if we set CC to just the basename; use the full file name. + shift + ac_cv_prog_CC="$as_dir/$ac_word${1+' '}$@" + fi +fi +fi +fi +CC=$ac_cv_prog_CC +if test -n "$CC"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5 +$as_echo "$CC" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + +fi +if test -z "$CC"; then + if test -n "$ac_tool_prefix"; then + for ac_prog in cl.exe + do + # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args. +set dummy $ac_tool_prefix$ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_CC+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$CC"; then + ac_cv_prog_CC="$CC" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_CC="$ac_tool_prefix$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +CC=$ac_cv_prog_CC +if test -n "$CC"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5 +$as_echo "$CC" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$CC" && break + done +fi +if test -z "$CC"; then + ac_ct_CC=$CC + for ac_prog in cl.exe +do + # Extract the first word of "$ac_prog", so it can be a program name with args. +set dummy $ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_ac_ct_CC+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$ac_ct_CC"; then + ac_cv_prog_ac_ct_CC="$ac_ct_CC" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_ac_ct_CC="$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +ac_ct_CC=$ac_cv_prog_ac_ct_CC +if test -n "$ac_ct_CC"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CC" >&5 +$as_echo "$ac_ct_CC" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$ac_ct_CC" && break +done + + if test "x$ac_ct_CC" = x; then + CC="" + else + case $cross_compiling:$ac_tool_warned in +yes:) +{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 +$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +ac_tool_warned=yes ;; +esac + CC=$ac_ct_CC + fi +fi + +fi + + +test -z "$CC" && { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? "no acceptable C compiler found in \$PATH +See \`config.log' for more details" "$LINENO" 5; } + +# Provide some information about the compiler. 
+$as_echo "$as_me:${as_lineno-$LINENO}: checking for C compiler version" >&5 +set X $ac_compile +ac_compiler=$2 +for ac_option in --version -v -V -qversion; do + { { ac_try="$ac_compiler $ac_option >&5" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_compiler $ac_option >&5") 2>conftest.err + ac_status=$? + if test -s conftest.err; then + sed '10a\ +... rest of stderr output deleted ... + 10q' conftest.err >conftest.er1 + cat conftest.er1 >&5 + fi + rm -f conftest.er1 conftest.err + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } +done + +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +ac_clean_files_save=$ac_clean_files +ac_clean_files="$ac_clean_files a.out a.out.dSYM a.exe b.out" +# Try to create an executable without -o first, disregard a.out. +# It will help us diagnose broken compilers, and finding out an intuition +# of exeext. +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the C compiler works" >&5 +$as_echo_n "checking whether the C compiler works... " >&6; } +ac_link_default=`$as_echo "$ac_link" | sed 's/ -o *conftest[^ ]*//'` + +# The possible output files: +ac_files="a.out conftest.exe conftest a.exe a_out.exe b.out conftest.*" + +ac_rmfiles= +for ac_file in $ac_files +do + case $ac_file in + *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj ) ;; + * ) ac_rmfiles="$ac_rmfiles $ac_file";; + esac +done +rm -f $ac_rmfiles + +if { { ac_try="$ac_link_default" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_link_default") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; then : + # Autoconf-2.13 could set the ac_cv_exeext variable to `no'. +# So ignore a value of `no', otherwise this would lead to `EXEEXT = no' +# in a Makefile. We should not override ac_cv_exeext if it was cached, +# so that the user can short-circuit this test for compilers unknown to +# Autoconf. +for ac_file in $ac_files '' +do + test -f "$ac_file" || continue + case $ac_file in + *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj ) + ;; + [ab].out ) + # We found the default executable, but exeext='' is most + # certainly right. + break;; + *.* ) + if test "${ac_cv_exeext+set}" = set && test "$ac_cv_exeext" != no; + then :; else + ac_cv_exeext=`expr "$ac_file" : '[^.]*\(\..*\)'` + fi + # We set ac_cv_exeext here because the later test for it is not + # safe: cross compilers may not add the suffix if given an `-o' + # argument, so we may need to know it at that point already. + # Even if this section looks crufty: it has the advantage of + # actually working. 
+ break;; + * ) + break;; + esac +done +test "$ac_cv_exeext" = no && ac_cv_exeext= + +else + ac_file='' +fi +if test -z "$ac_file"; then : + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +$as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + +{ { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error 77 "C compiler cannot create executables +See \`config.log' for more details" "$LINENO" 5; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for C compiler default output file name" >&5 +$as_echo_n "checking for C compiler default output file name... " >&6; } +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_file" >&5 +$as_echo "$ac_file" >&6; } +ac_exeext=$ac_cv_exeext + +rm -f -r a.out a.out.dSYM a.exe conftest$ac_cv_exeext b.out +ac_clean_files=$ac_clean_files_save +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for suffix of executables" >&5 +$as_echo_n "checking for suffix of executables... " >&6; } +if { { ac_try="$ac_link" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_link") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; then : + # If both `conftest.exe' and `conftest' are `present' (well, observable) +# catch `conftest.exe'. For instance with Cygwin, `ls conftest' will +# work properly (i.e., refer to `conftest.exe'), while it won't with +# `rm'. +for ac_file in conftest.exe conftest conftest.*; do + test -f "$ac_file" || continue + case $ac_file in + *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj ) ;; + *.* ) ac_cv_exeext=`expr "$ac_file" : '[^.]*\(\..*\)'` + break;; + * ) break;; + esac +done +else + { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? "cannot compute suffix of executables: cannot compile and link +See \`config.log' for more details" "$LINENO" 5; } +fi +rm -f conftest conftest$ac_cv_exeext +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_exeext" >&5 +$as_echo "$ac_cv_exeext" >&6; } + +rm -f conftest.$ac_ext +EXEEXT=$ac_cv_exeext +ac_exeext=$EXEEXT +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +int +main () +{ +FILE *f = fopen ("conftest.out", "w"); + return ferror (f) || fclose (f) != 0; + + ; + return 0; +} +_ACEOF +ac_clean_files="$ac_clean_files conftest.out" +# Check that the compiler produces executables we can run. If not, either +# the compiler is broken, or we cross compile. +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are cross compiling" >&5 +$as_echo_n "checking whether we are cross compiling... " >&6; } +if test "$cross_compiling" != yes; then + { { ac_try="$ac_link" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_link") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? 
= $ac_status" >&5 + test $ac_status = 0; } + if { ac_try='./conftest$ac_cv_exeext' + { { case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_try") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; }; then + cross_compiling=no + else + if test "$cross_compiling" = maybe; then + cross_compiling=yes + else + { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? "cannot run C compiled programs. +If you meant to cross compile, use \`--host'. +See \`config.log' for more details" "$LINENO" 5; } + fi + fi +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $cross_compiling" >&5 +$as_echo "$cross_compiling" >&6; } + +rm -f conftest.$ac_ext conftest$ac_cv_exeext conftest.out +ac_clean_files=$ac_clean_files_save +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for suffix of object files" >&5 +$as_echo_n "checking for suffix of object files... " >&6; } +if ${ac_cv_objext+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +rm -f conftest.o conftest.obj +if { { ac_try="$ac_compile" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_compile") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; then : + for ac_file in conftest.o conftest.obj conftest.*; do + test -f "$ac_file" || continue; + case $ac_file in + *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM ) ;; + *) ac_cv_objext=`expr "$ac_file" : '.*\.\(.*\)'` + break;; + esac +done +else + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + +{ { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? "cannot compute suffix of object files: cannot compile +See \`config.log' for more details" "$LINENO" 5; } +fi +rm -f conftest.$ac_cv_objext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_objext" >&5 +$as_echo "$ac_cv_objext" >&6; } +OBJEXT=$ac_cv_objext +ac_objext=$OBJEXT +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are using the GNU C compiler" >&5 +$as_echo_n "checking whether we are using the GNU C compiler... " >&6; } +if ${ac_cv_c_compiler_gnu+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ +#ifndef __GNUC__ + choke me +#endif + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_compiler_gnu=yes +else + ac_compiler_gnu=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +ac_cv_c_compiler_gnu=$ac_compiler_gnu + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_c_compiler_gnu" >&5 +$as_echo "$ac_cv_c_compiler_gnu" >&6; } +if test $ac_compiler_gnu = yes; then + GCC=yes +else + GCC= +fi +ac_test_CFLAGS=${CFLAGS+set} +ac_save_CFLAGS=$CFLAGS +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CC accepts -g" >&5 +$as_echo_n "checking whether $CC accepts -g... 
" >&6; } +if ${ac_cv_prog_cc_g+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_save_c_werror_flag=$ac_c_werror_flag + ac_c_werror_flag=yes + ac_cv_prog_cc_g=no + CFLAGS="-g" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_prog_cc_g=yes +else + CFLAGS="" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + +else + ac_c_werror_flag=$ac_save_c_werror_flag + CFLAGS="-g" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_prog_cc_g=yes +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + ac_c_werror_flag=$ac_save_c_werror_flag +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_g" >&5 +$as_echo "$ac_cv_prog_cc_g" >&6; } +if test "$ac_test_CFLAGS" = set; then + CFLAGS=$ac_save_CFLAGS +elif test $ac_cv_prog_cc_g = yes; then + if test "$GCC" = yes; then + CFLAGS="-g -O2" + else + CFLAGS="-g" + fi +else + if test "$GCC" = yes; then + CFLAGS="-O2" + else + CFLAGS= + fi +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $CC option to accept ISO C89" >&5 +$as_echo_n "checking for $CC option to accept ISO C89... " >&6; } +if ${ac_cv_prog_cc_c89+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_cv_prog_cc_c89=no +ac_save_CC=$CC +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +#include +struct stat; +/* Most of the following tests are stolen from RCS 5.7's src/conf.sh. */ +struct buf { int x; }; +FILE * (*rcsopen) (struct buf *, struct stat *, int); +static char *e (p, i) + char **p; + int i; +{ + return p[i]; +} +static char *f (char * (*g) (char **, int), char **p, ...) +{ + char *s; + va_list v; + va_start (v,p); + s = g (p, va_arg (v,int)); + va_end (v); + return s; +} + +/* OSF 4.0 Compaq cc is some sort of almost-ANSI by default. It has + function prototypes and stuff, but not '\xHH' hex character constants. + These don't provoke an error unfortunately, instead are silently treated + as 'x'. The following induces an error, until -std is added to get + proper ANSI mode. Curiously '\x00'!='x' always comes out true, for an + array size at least. It's necessary to write '\x00'==0 to get something + that's true only with -std. */ +int osf4_cc_array ['\x00' == 0 ? 1 : -1]; + +/* IBM C 6 for AIX is almost-ANSI by default, but it replaces macro parameters + inside strings and character constants. */ +#define FOO(x) 'x' +int xlc6_cc_array[FOO(a) == 'x' ? 
1 : -1]; + +int test (int i, double x); +struct s1 {int (*f) (int a);}; +struct s2 {int (*f) (double a);}; +int pairnames (int, char **, FILE *(*)(struct buf *, struct stat *, int), int, int); +int argc; +char **argv; +int +main () +{ +return f (e, argv, 0) != argv[0] || f (e, argv, 1) != argv[1]; + ; + return 0; +} +_ACEOF +for ac_arg in '' -qlanglvl=extc89 -qlanglvl=ansi -std \ + -Ae "-Aa -D_HPUX_SOURCE" "-Xc -D__EXTENSIONS__" +do + CC="$ac_save_CC $ac_arg" + if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_prog_cc_c89=$ac_arg +fi +rm -f core conftest.err conftest.$ac_objext + test "x$ac_cv_prog_cc_c89" != "xno" && break +done +rm -f conftest.$ac_ext +CC=$ac_save_CC + +fi +# AC_CACHE_VAL +case "x$ac_cv_prog_cc_c89" in + x) + { $as_echo "$as_me:${as_lineno-$LINENO}: result: none needed" >&5 +$as_echo "none needed" >&6; } ;; + xno) + { $as_echo "$as_me:${as_lineno-$LINENO}: result: unsupported" >&5 +$as_echo "unsupported" >&6; } ;; + *) + CC="$CC $ac_cv_prog_cc_c89" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_c89" >&5 +$as_echo "$ac_cv_prog_cc_c89" >&6; } ;; +esac +if test "x$ac_cv_prog_cc_c89" != xno; then : + +fi + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + +depcc="$CC" am_compiler_list= + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking dependency style of $depcc" >&5 +$as_echo_n "checking dependency style of $depcc... " >&6; } +if ${am_cv_CC_dependencies_compiler_type+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -z "$AMDEP_TRUE" && test -f "$am_depcomp"; then + # We make a subdir and do the tests there. Otherwise we can end up + # making bogus files that we don't know about and never remove. For + # instance it was reported that on HP-UX the gcc test will end up + # making a dummy file named 'D' -- because '-MD' means "put the output + # in D". + rm -rf conftest.dir + mkdir conftest.dir + # Copy depcomp to subdir because otherwise we won't find it if we're + # using a relative directory. + cp "$am_depcomp" conftest.dir + cd conftest.dir + # We will build objects and dependencies in a subdirectory because + # it helps to detect inapplicable dependency modes. For instance + # both Tru64's cc and ICC support -MD to output dependencies as a + # side effect of compilation, but ICC will put the dependencies in + # the current directory while Tru64 will put them in the object + # directory. + mkdir sub + + am_cv_CC_dependencies_compiler_type=none + if test "$am_compiler_list" = ""; then + am_compiler_list=`sed -n 's/^#*\([a-zA-Z0-9]*\))$/\1/p' < ./depcomp` + fi + am__universal=false + case " $depcc " in #( + *\ -arch\ *\ -arch\ *) am__universal=true ;; + esac + + for depmode in $am_compiler_list; do + # Setup a source with many dependencies, because some compilers + # like to wrap large dependency lists on column 80 (with \), and + # we should not choose a depcomp mode which is confused by this. + # + # We need to recreate these files for each test, as the compiler may + # overwrite some of them when testing with obscure command lines. + # This happens at least with the AIX C compiler. + : > sub/conftest.c + for i in 1 2 3 4 5 6; do + echo '#include "conftst'$i'.h"' >> sub/conftest.c + # Using ": > sub/conftst$i.h" creates only sub/conftst1.h with + # Solaris 10 /bin/sh. 
+ echo '/* dummy */' > sub/conftst$i.h + done + echo "${am__include} ${am__quote}sub/conftest.Po${am__quote}" > confmf + + # We check with '-c' and '-o' for the sake of the "dashmstdout" + # mode. It turns out that the SunPro C++ compiler does not properly + # handle '-M -o', and we need to detect this. Also, some Intel + # versions had trouble with output in subdirs. + am__obj=sub/conftest.${OBJEXT-o} + am__minus_obj="-o $am__obj" + case $depmode in + gcc) + # This depmode causes a compiler race in universal mode. + test "$am__universal" = false || continue + ;; + nosideeffect) + # After this tag, mechanisms are not by side-effect, so they'll + # only be used when explicitly requested. + if test "x$enable_dependency_tracking" = xyes; then + continue + else + break + fi + ;; + msvc7 | msvc7msys | msvisualcpp | msvcmsys) + # This compiler won't grok '-c -o', but also, the minuso test has + # not run yet. These depmodes are late enough in the game, and + # so weak that their functioning should not be impacted. + am__obj=conftest.${OBJEXT-o} + am__minus_obj= + ;; + none) break ;; + esac + if depmode=$depmode \ + source=sub/conftest.c object=$am__obj \ + depfile=sub/conftest.Po tmpdepfile=sub/conftest.TPo \ + $SHELL ./depcomp $depcc -c $am__minus_obj sub/conftest.c \ + >/dev/null 2>conftest.err && + grep sub/conftst1.h sub/conftest.Po > /dev/null 2>&1 && + grep sub/conftst6.h sub/conftest.Po > /dev/null 2>&1 && + grep $am__obj sub/conftest.Po > /dev/null 2>&1 && + ${MAKE-make} -s -f confmf > /dev/null 2>&1; then + # icc doesn't choke on unknown options, it will just issue warnings + # or remarks (even with -Werror). So we grep stderr for any message + # that says an option was ignored or not supported. + # When given -MP, icc 7.0 and 7.1 complain thusly: + # icc: Command line warning: ignoring option '-M'; no argument required + # The diagnosis changed in icc 8.0: + # icc: Command line remark: option '-MP' not supported + if (grep 'ignoring option' conftest.err || + grep 'not supported' conftest.err) >/dev/null 2>&1; then :; else + am_cv_CC_dependencies_compiler_type=$depmode + break + fi + fi + done + + cd .. + rm -rf conftest.dir +else + am_cv_CC_dependencies_compiler_type=none +fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_CC_dependencies_compiler_type" >&5 +$as_echo "$am_cv_CC_dependencies_compiler_type" >&6; } +CCDEPMODE=depmode=$am_cv_CC_dependencies_compiler_type + + if + test "x$enable_dependency_tracking" != xno \ + && test "$am_cv_CC_dependencies_compiler_type" = gcc3; then + am__fastdepCC_TRUE= + am__fastdepCC_FALSE='#' +else + am__fastdepCC_TRUE='#' + am__fastdepCC_FALSE= +fi + + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $CC option to accept ISO C99" >&5 +$as_echo_n "checking for $CC option to accept ISO C99... " >&6; } +if ${ac_cv_prog_cc_c99+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_cv_prog_cc_c99=no +ac_save_CC=$CC +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +#include +#include +#include +#include + +// Check varargs macros. These examples are taken from C99 6.10.3.5. +#define debug(...) fprintf (stderr, __VA_ARGS__) +#define showlist(...) puts (#__VA_ARGS__) +#define report(test,...) ((test) ? puts (#test) : printf (__VA_ARGS__)) +static void +test_varargs_macros (void) +{ + int x = 1234; + int y = 5678; + debug ("Flag"); + debug ("X = %d\n", x); + showlist (The first, second, and third items.); + report (x>y, "x is %d but y is %d", x, y); +} + +// Check long long types. 
+#define BIG64 18446744073709551615ull +#define BIG32 4294967295ul +#define BIG_OK (BIG64 / BIG32 == 4294967297ull && BIG64 % BIG32 == 0) +#if !BIG_OK + your preprocessor is broken; +#endif +#if BIG_OK +#else + your preprocessor is broken; +#endif +static long long int bignum = -9223372036854775807LL; +static unsigned long long int ubignum = BIG64; + +struct incomplete_array +{ + int datasize; + double data[]; +}; + +struct named_init { + int number; + const wchar_t *name; + double average; +}; + +typedef const char *ccp; + +static inline int +test_restrict (ccp restrict text) +{ + // See if C++-style comments work. + // Iterate through items via the restricted pointer. + // Also check for declarations in for loops. + for (unsigned int i = 0; *(text+i) != '\0'; ++i) + continue; + return 0; +} + +// Check varargs and va_copy. +static void +test_varargs (const char *format, ...) +{ + va_list args; + va_start (args, format); + va_list args_copy; + va_copy (args_copy, args); + + const char *str; + int number; + float fnumber; + + while (*format) + { + switch (*format++) + { + case 's': // string + str = va_arg (args_copy, const char *); + break; + case 'd': // int + number = va_arg (args_copy, int); + break; + case 'f': // float + fnumber = va_arg (args_copy, double); + break; + default: + break; + } + } + va_end (args_copy); + va_end (args); +} + +int +main () +{ + + // Check bool. + _Bool success = false; + + // Check restrict. + if (test_restrict ("String literal") == 0) + success = true; + char *restrict newvar = "Another string"; + + // Check varargs. + test_varargs ("s, d' f .", "string", 65, 34.234); + test_varargs_macros (); + + // Check flexible array members. + struct incomplete_array *ia = + malloc (sizeof (struct incomplete_array) + (sizeof (double) * 10)); + ia->datasize = 10; + for (int i = 0; i < ia->datasize; ++i) + ia->data[i] = i * 1.234; + + // Check named initializers. + struct named_init ni = { + .number = 34, + .name = L"Test wide string", + .average = 543.34343, + }; + + ni.number = 58; + + int dynamic_array[ni.number]; + dynamic_array[ni.number - 1] = 543; + + // work around unused variable warnings + return (!success || bignum == 0LL || ubignum == 0uLL || newvar[0] == 'x' + || dynamic_array[ni.number - 1] != 543); + + ; + return 0; +} +_ACEOF +for ac_arg in '' -std=gnu99 -std=c99 -c99 -AC99 -D_STDC_C99= -qlanglvl=extc99 +do + CC="$ac_save_CC $ac_arg" + if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_prog_cc_c99=$ac_arg +fi +rm -f core conftest.err conftest.$ac_objext + test "x$ac_cv_prog_cc_c99" != "xno" && break +done +rm -f conftest.$ac_ext +CC=$ac_save_CC + +fi +# AC_CACHE_VAL +case "x$ac_cv_prog_cc_c99" in + x) + { $as_echo "$as_me:${as_lineno-$LINENO}: result: none needed" >&5 +$as_echo "none needed" >&6; } ;; + xno) + { $as_echo "$as_me:${as_lineno-$LINENO}: result: unsupported" >&5 +$as_echo "unsupported" >&6; } ;; + *) + CC="$CC $ac_cv_prog_cc_c99" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_c99" >&5 +$as_echo "$ac_cv_prog_cc_c99" >&6; } ;; +esac +if test "x$ac_cv_prog_cc_c99" != xno; then : + +fi + + + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking how to run the C preprocessor" >&5 +$as_echo_n "checking how to run the C preprocessor... " >&6; } +# On Suns, sometimes $CPP names a directory. 
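A brief illustration, not part of the generated script: whichever option from the list above is needed gets appended to CC for the rest of the run (so CC may end up as, say, "gcc -std=gnu99"), the choice is cached as ac_cv_prog_cc_c99, and the "checking for $CC option to accept ISO C99" line reports it. A builder can also supply a suitable compiler invocation up front, for example

    ./configure CC="gcc -std=gnu99"         # hypothetical; the probe then typically reports "none needed"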
+if test -n "$CPP" && test -d "$CPP"; then + CPP= +fi +if test -z "$CPP"; then + if ${ac_cv_prog_CPP+:} false; then : + $as_echo_n "(cached) " >&6 +else + # Double quotes because CPP needs to be expanded + for CPP in "$CC -E" "$CC -E -traditional-cpp" "/lib/cpp" + do + ac_preproc_ok=false +for ac_c_preproc_warn_flag in '' yes +do + # Use a header file that comes with gcc, so configuring glibc + # with a fresh cross-compiler works. + # Prefer to if __STDC__ is defined, since + # exists even on freestanding compilers. + # On the NeXT, cc -E runs the code through the compiler's parser, + # not just through cpp. "Syntax error" is here to catch this case. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#ifdef __STDC__ +# include +#else +# include +#endif + Syntax error +_ACEOF +if ac_fn_c_try_cpp "$LINENO"; then : + +else + # Broken: fails on valid input. +continue +fi +rm -f conftest.err conftest.i conftest.$ac_ext + + # OK, works on sane cases. Now check whether nonexistent headers + # can be detected and how. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +_ACEOF +if ac_fn_c_try_cpp "$LINENO"; then : + # Broken: success on invalid input. +continue +else + # Passes both tests. +ac_preproc_ok=: +break +fi +rm -f conftest.err conftest.i conftest.$ac_ext + +done +# Because of `break', _AC_PREPROC_IFELSE's cleaning code was skipped. +rm -f conftest.i conftest.err conftest.$ac_ext +if $ac_preproc_ok; then : + break +fi + + done + ac_cv_prog_CPP=$CPP + +fi + CPP=$ac_cv_prog_CPP +else + ac_cv_prog_CPP=$CPP +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $CPP" >&5 +$as_echo "$CPP" >&6; } +ac_preproc_ok=false +for ac_c_preproc_warn_flag in '' yes +do + # Use a header file that comes with gcc, so configuring glibc + # with a fresh cross-compiler works. + # Prefer to if __STDC__ is defined, since + # exists even on freestanding compilers. + # On the NeXT, cc -E runs the code through the compiler's parser, + # not just through cpp. "Syntax error" is here to catch this case. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#ifdef __STDC__ +# include +#else +# include +#endif + Syntax error +_ACEOF +if ac_fn_c_try_cpp "$LINENO"; then : + +else + # Broken: fails on valid input. +continue +fi +rm -f conftest.err conftest.i conftest.$ac_ext + + # OK, works on sane cases. Now check whether nonexistent headers + # can be detected and how. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +_ACEOF +if ac_fn_c_try_cpp "$LINENO"; then : + # Broken: success on invalid input. +continue +else + # Passes both tests. +ac_preproc_ok=: +break +fi +rm -f conftest.err conftest.i conftest.$ac_ext + +done +# Because of `break', _AC_PREPROC_IFELSE's cleaning code was skipped. +rm -f conftest.i conftest.err conftest.$ac_ext +if $ac_preproc_ok; then : + +else + { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? "C preprocessor \"$CPP\" fails sanity check +See \`config.log' for more details" "$LINENO" 5; } +fi + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for grep that handles long lines and -e" >&5 +$as_echo_n "checking for grep that handles long lines and -e... 
" >&6; } +if ${ac_cv_path_GREP+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -z "$GREP"; then + ac_path_GREP_found=false + # Loop through the user's path and test for each of PROGNAME-LIST + as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_prog in grep ggrep; do + for ac_exec_ext in '' $ac_executable_extensions; do + ac_path_GREP="$as_dir/$ac_prog$ac_exec_ext" + as_fn_executable_p "$ac_path_GREP" || continue +# Check for GNU ac_path_GREP and select it if it is found. + # Check for GNU $ac_path_GREP +case `"$ac_path_GREP" --version 2>&1` in +*GNU*) + ac_cv_path_GREP="$ac_path_GREP" ac_path_GREP_found=:;; +*) + ac_count=0 + $as_echo_n 0123456789 >"conftest.in" + while : + do + cat "conftest.in" "conftest.in" >"conftest.tmp" + mv "conftest.tmp" "conftest.in" + cp "conftest.in" "conftest.nl" + $as_echo 'GREP' >> "conftest.nl" + "$ac_path_GREP" -e 'GREP$' -e '-(cannot match)-' < "conftest.nl" >"conftest.out" 2>/dev/null || break + diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break + as_fn_arith $ac_count + 1 && ac_count=$as_val + if test $ac_count -gt ${ac_path_GREP_max-0}; then + # Best one so far, save it but keep looking for a better one + ac_cv_path_GREP="$ac_path_GREP" + ac_path_GREP_max=$ac_count + fi + # 10*(2^10) chars as input seems more than enough + test $ac_count -gt 10 && break + done + rm -f conftest.in conftest.tmp conftest.nl conftest.out;; +esac + + $ac_path_GREP_found && break 3 + done + done + done +IFS=$as_save_IFS + if test -z "$ac_cv_path_GREP"; then + as_fn_error $? "no acceptable grep could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" "$LINENO" 5 + fi +else + ac_cv_path_GREP=$GREP +fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_GREP" >&5 +$as_echo "$ac_cv_path_GREP" >&6; } + GREP="$ac_cv_path_GREP" + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for egrep" >&5 +$as_echo_n "checking for egrep... " >&6; } +if ${ac_cv_path_EGREP+:} false; then : + $as_echo_n "(cached) " >&6 +else + if echo a | $GREP -E '(a|b)' >/dev/null 2>&1 + then ac_cv_path_EGREP="$GREP -E" + else + if test -z "$EGREP"; then + ac_path_EGREP_found=false + # Loop through the user's path and test for each of PROGNAME-LIST + as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_prog in egrep; do + for ac_exec_ext in '' $ac_executable_extensions; do + ac_path_EGREP="$as_dir/$ac_prog$ac_exec_ext" + as_fn_executable_p "$ac_path_EGREP" || continue +# Check for GNU ac_path_EGREP and select it if it is found. 
+ # Check for GNU $ac_path_EGREP +case `"$ac_path_EGREP" --version 2>&1` in +*GNU*) + ac_cv_path_EGREP="$ac_path_EGREP" ac_path_EGREP_found=:;; +*) + ac_count=0 + $as_echo_n 0123456789 >"conftest.in" + while : + do + cat "conftest.in" "conftest.in" >"conftest.tmp" + mv "conftest.tmp" "conftest.in" + cp "conftest.in" "conftest.nl" + $as_echo 'EGREP' >> "conftest.nl" + "$ac_path_EGREP" 'EGREP$' < "conftest.nl" >"conftest.out" 2>/dev/null || break + diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break + as_fn_arith $ac_count + 1 && ac_count=$as_val + if test $ac_count -gt ${ac_path_EGREP_max-0}; then + # Best one so far, save it but keep looking for a better one + ac_cv_path_EGREP="$ac_path_EGREP" + ac_path_EGREP_max=$ac_count + fi + # 10*(2^10) chars as input seems more than enough + test $ac_count -gt 10 && break + done + rm -f conftest.in conftest.tmp conftest.nl conftest.out;; +esac + + $ac_path_EGREP_found && break 3 + done + done + done +IFS=$as_save_IFS + if test -z "$ac_cv_path_EGREP"; then + as_fn_error $? "no acceptable egrep could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" "$LINENO" 5 + fi +else + ac_cv_path_EGREP=$EGREP +fi + + fi +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_EGREP" >&5 +$as_echo "$ac_cv_path_EGREP" >&6; } + EGREP="$ac_cv_path_EGREP" + + +if test $ac_cv_c_compiler_gnu = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CC needs -traditional" >&5 +$as_echo_n "checking whether $CC needs -traditional... " >&6; } +if ${ac_cv_prog_gcc_traditional+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_pattern="Autoconf.*'x'" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +Autoconf TIOCGETP +_ACEOF +if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | + $EGREP "$ac_pattern" >/dev/null 2>&1; then : + ac_cv_prog_gcc_traditional=yes +else + ac_cv_prog_gcc_traditional=no +fi +rm -f conftest* + + + if test $ac_cv_prog_gcc_traditional = no; then + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +Autoconf TCGETA +_ACEOF +if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | + $EGREP "$ac_pattern" >/dev/null 2>&1; then : + ac_cv_prog_gcc_traditional=yes +fi +rm -f conftest* + + fi +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_gcc_traditional" >&5 +$as_echo "$ac_cv_prog_gcc_traditional" >&6; } + if test $ac_cv_prog_gcc_traditional = yes; then + CC="$CC -traditional" + fi +fi + +if test "x$CC" != xcc; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CC and cc understand -c and -o together" >&5 +$as_echo_n "checking whether $CC and cc understand -c and -o together... " >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether cc understands -c and -o together" >&5 +$as_echo_n "checking whether cc understands -c and -o together... " >&6; } +fi +set dummy $CC; ac_cc=`$as_echo "$2" | + sed 's/[^a-zA-Z0-9_]/_/g;s/^[0-9]/_/'` +if eval \${ac_cv_prog_cc_${ac_cc}_c_o+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +# Make sure it works both with $CC and with simple cc. +# We do the test twice because some compilers refuse to overwrite an +# existing .o file with -o, though they will create one. 
+ac_try='$CC -c conftest.$ac_ext -o conftest2.$ac_objext >&5' +rm -f conftest2.* +if { { case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_try") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } && + test -f conftest2.$ac_objext && { { case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_try") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; +then + eval ac_cv_prog_cc_${ac_cc}_c_o=yes + if test "x$CC" != xcc; then + # Test first that cc exists at all. + if { ac_try='cc -c conftest.$ac_ext >&5' + { { case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_try") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; }; then + ac_try='cc -c conftest.$ac_ext -o conftest2.$ac_objext >&5' + rm -f conftest2.* + if { { case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_try") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } && + test -f conftest2.$ac_objext && { { case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_try") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; + then + # cc works too. + : + else + # cc exists but doesn't like -o. + eval ac_cv_prog_cc_${ac_cc}_c_o=no + fi + fi + fi +else + eval ac_cv_prog_cc_${ac_cc}_c_o=no +fi +rm -f core conftest* + +fi +if eval test \$ac_cv_prog_cc_${ac_cc}_c_o = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } + +$as_echo "#define NO_MINUS_C_MINUS_O 1" >>confdefs.h + +fi + +# FIXME: we rely on the cache variable name because +# there is no other way. +set dummy $CC +am_cc=`echo $2 | sed 's/[^a-zA-Z0-9_]/_/g;s/^[0-9]/_/'` +eval am_t=\$ac_cv_prog_cc_${am_cc}_c_o +if test "$am_t" != yes; then + # Losing compiler, so override with the script. + # FIXME: It is wrong to rewrite CC. + # But if we don't then we get into trouble of one sort or another. + # A longer-term fix would be to have automake use am__CC in this case, + # and then we could set am__CC="\$(top_srcdir)/compile \$(CC)" + CC="$am_aux_dir/compile $CC" +fi + + +# By default we simply use the C compiler to build assembly code. + +test "${CCAS+set}" = set || CCAS=$CC +test "${CCASFLAGS+set}" = set || CCASFLAGS=$CFLAGS + + + +depcc="$CCAS" am_compiler_list= + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking dependency style of $depcc" >&5 +$as_echo_n "checking dependency style of $depcc... 
" >&6; } +if ${am_cv_CCAS_dependencies_compiler_type+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -z "$AMDEP_TRUE" && test -f "$am_depcomp"; then + # We make a subdir and do the tests there. Otherwise we can end up + # making bogus files that we don't know about and never remove. For + # instance it was reported that on HP-UX the gcc test will end up + # making a dummy file named 'D' -- because '-MD' means "put the output + # in D". + rm -rf conftest.dir + mkdir conftest.dir + # Copy depcomp to subdir because otherwise we won't find it if we're + # using a relative directory. + cp "$am_depcomp" conftest.dir + cd conftest.dir + # We will build objects and dependencies in a subdirectory because + # it helps to detect inapplicable dependency modes. For instance + # both Tru64's cc and ICC support -MD to output dependencies as a + # side effect of compilation, but ICC will put the dependencies in + # the current directory while Tru64 will put them in the object + # directory. + mkdir sub + + am_cv_CCAS_dependencies_compiler_type=none + if test "$am_compiler_list" = ""; then + am_compiler_list=`sed -n 's/^#*\([a-zA-Z0-9]*\))$/\1/p' < ./depcomp` + fi + am__universal=false + + + for depmode in $am_compiler_list; do + # Setup a source with many dependencies, because some compilers + # like to wrap large dependency lists on column 80 (with \), and + # we should not choose a depcomp mode which is confused by this. + # + # We need to recreate these files for each test, as the compiler may + # overwrite some of them when testing with obscure command lines. + # This happens at least with the AIX C compiler. + : > sub/conftest.c + for i in 1 2 3 4 5 6; do + echo '#include "conftst'$i'.h"' >> sub/conftest.c + # Using ": > sub/conftst$i.h" creates only sub/conftst1.h with + # Solaris 10 /bin/sh. + echo '/* dummy */' > sub/conftst$i.h + done + echo "${am__include} ${am__quote}sub/conftest.Po${am__quote}" > confmf + + # We check with '-c' and '-o' for the sake of the "dashmstdout" + # mode. It turns out that the SunPro C++ compiler does not properly + # handle '-M -o', and we need to detect this. Also, some Intel + # versions had trouble with output in subdirs. + am__obj=sub/conftest.${OBJEXT-o} + am__minus_obj="-o $am__obj" + case $depmode in + gcc) + # This depmode causes a compiler race in universal mode. + test "$am__universal" = false || continue + ;; + nosideeffect) + # After this tag, mechanisms are not by side-effect, so they'll + # only be used when explicitly requested. + if test "x$enable_dependency_tracking" = xyes; then + continue + else + break + fi + ;; + msvc7 | msvc7msys | msvisualcpp | msvcmsys) + # This compiler won't grok '-c -o', but also, the minuso test has + # not run yet. These depmodes are late enough in the game, and + # so weak that their functioning should not be impacted. + am__obj=conftest.${OBJEXT-o} + am__minus_obj= + ;; + none) break ;; + esac + if depmode=$depmode \ + source=sub/conftest.c object=$am__obj \ + depfile=sub/conftest.Po tmpdepfile=sub/conftest.TPo \ + $SHELL ./depcomp $depcc -c $am__minus_obj sub/conftest.c \ + >/dev/null 2>conftest.err && + grep sub/conftst1.h sub/conftest.Po > /dev/null 2>&1 && + grep sub/conftst6.h sub/conftest.Po > /dev/null 2>&1 && + grep $am__obj sub/conftest.Po > /dev/null 2>&1 && + ${MAKE-make} -s -f confmf > /dev/null 2>&1; then + # icc doesn't choke on unknown options, it will just issue warnings + # or remarks (even with -Werror). 
So we grep stderr for any message + # that says an option was ignored or not supported. + # When given -MP, icc 7.0 and 7.1 complain thusly: + # icc: Command line warning: ignoring option '-M'; no argument required + # The diagnosis changed in icc 8.0: + # icc: Command line remark: option '-MP' not supported + if (grep 'ignoring option' conftest.err || + grep 'not supported' conftest.err) >/dev/null 2>&1; then :; else + am_cv_CCAS_dependencies_compiler_type=$depmode + break + fi + fi + done + + cd .. + rm -rf conftest.dir +else + am_cv_CCAS_dependencies_compiler_type=none +fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_CCAS_dependencies_compiler_type" >&5 +$as_echo "$am_cv_CCAS_dependencies_compiler_type" >&6; } +CCASDEPMODE=depmode=$am_cv_CCAS_dependencies_compiler_type + + if + test "x$enable_dependency_tracking" != xno \ + && test "$am_cv_CCAS_dependencies_compiler_type" = gcc3; then + am__fastdepCCAS_TRUE= + am__fastdepCCAS_FALSE='#' +else + am__fastdepCCAS_TRUE='#' + am__fastdepCCAS_FALSE= +fi + + +if test -n "$ac_tool_prefix"; then + # Extract the first word of "${ac_tool_prefix}ranlib", so it can be a program name with args. +set dummy ${ac_tool_prefix}ranlib; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_RANLIB+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$RANLIB"; then + ac_cv_prog_RANLIB="$RANLIB" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_RANLIB="${ac_tool_prefix}ranlib" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +RANLIB=$ac_cv_prog_RANLIB +if test -n "$RANLIB"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $RANLIB" >&5 +$as_echo "$RANLIB" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + +fi +if test -z "$ac_cv_prog_RANLIB"; then + ac_ct_RANLIB=$RANLIB + # Extract the first word of "ranlib", so it can be a program name with args. +set dummy ranlib; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_ac_ct_RANLIB+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$ac_ct_RANLIB"; then + ac_cv_prog_ac_ct_RANLIB="$ac_ct_RANLIB" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_ac_ct_RANLIB="ranlib" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +ac_ct_RANLIB=$ac_cv_prog_ac_ct_RANLIB +if test -n "$ac_ct_RANLIB"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_RANLIB" >&5 +$as_echo "$ac_ct_RANLIB" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + if test "x$ac_ct_RANLIB" = x; then + RANLIB=":" + else + case $cross_compiling:$ac_tool_warned in +yes:) +{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 +$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +ac_tool_warned=yes ;; +esac + RANLIB=$ac_ct_RANLIB + fi +else + RANLIB="$ac_cv_prog_RANLIB" +fi + +ac_ext=cpp +ac_cpp='$CXXCPP $CPPFLAGS' +ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_cxx_compiler_gnu +if test -z "$CXX"; then + if test -n "$CCC"; then + CXX=$CCC + else + if test -n "$ac_tool_prefix"; then + for ac_prog in g++ c++ gpp aCC CC cxx cc++ cl.exe FCC KCC RCC xlC_r xlC + do + # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args. +set dummy $ac_tool_prefix$ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_CXX+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$CXX"; then + ac_cv_prog_CXX="$CXX" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_CXX="$ac_tool_prefix$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +CXX=$ac_cv_prog_CXX +if test -n "$CXX"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CXX" >&5 +$as_echo "$CXX" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$CXX" && break + done +fi +if test -z "$CXX"; then + ac_ct_CXX=$CXX + for ac_prog in g++ c++ gpp aCC CC cxx cc++ cl.exe FCC KCC RCC xlC_r xlC +do + # Extract the first word of "$ac_prog", so it can be a program name with args. +set dummy $ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_ac_ct_CXX+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$ac_ct_CXX"; then + ac_cv_prog_ac_ct_CXX="$ac_ct_CXX" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_ac_ct_CXX="$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +ac_ct_CXX=$ac_cv_prog_ac_ct_CXX +if test -n "$ac_ct_CXX"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CXX" >&5 +$as_echo "$ac_ct_CXX" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$ac_ct_CXX" && break +done + + if test "x$ac_ct_CXX" = x; then + CXX="g++" + else + case $cross_compiling:$ac_tool_warned in +yes:) +{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 +$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +ac_tool_warned=yes ;; +esac + CXX=$ac_ct_CXX + fi +fi + + fi +fi +# Provide some information about the compiler. +$as_echo "$as_me:${as_lineno-$LINENO}: checking for C++ compiler version" >&5 +set X $ac_compile +ac_compiler=$2 +for ac_option in --version -v -V -qversion; do + { { ac_try="$ac_compiler $ac_option >&5" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_compiler $ac_option >&5") 2>conftest.err + ac_status=$? + if test -s conftest.err; then + sed '10a\ +... rest of stderr output deleted ... + 10q' conftest.err >conftest.er1 + cat conftest.er1 >&5 + fi + rm -f conftest.er1 conftest.err + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } +done + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are using the GNU C++ compiler" >&5 +$as_echo_n "checking whether we are using the GNU C++ compiler... " >&6; } +if ${ac_cv_cxx_compiler_gnu+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ +#ifndef __GNUC__ + choke me +#endif + + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_compile "$LINENO"; then : + ac_compiler_gnu=yes +else + ac_compiler_gnu=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +ac_cv_cxx_compiler_gnu=$ac_compiler_gnu + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_cxx_compiler_gnu" >&5 +$as_echo "$ac_cv_cxx_compiler_gnu" >&6; } +if test $ac_compiler_gnu = yes; then + GXX=yes +else + GXX= +fi +ac_test_CXXFLAGS=${CXXFLAGS+set} +ac_save_CXXFLAGS=$CXXFLAGS +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CXX accepts -g" >&5 +$as_echo_n "checking whether $CXX accepts -g... " >&6; } +if ${ac_cv_prog_cxx_g+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_save_cxx_werror_flag=$ac_cxx_werror_flag + ac_cxx_werror_flag=yes + ac_cv_prog_cxx_g=no + CXXFLAGS="-g" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_compile "$LINENO"; then : + ac_cv_prog_cxx_g=yes +else + CXXFLAGS="" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_compile "$LINENO"; then : + +else + ac_cxx_werror_flag=$ac_save_cxx_werror_flag + CXXFLAGS="-g" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ + +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_compile "$LINENO"; then : + ac_cv_prog_cxx_g=yes +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + ac_cxx_werror_flag=$ac_save_cxx_werror_flag +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cxx_g" >&5 +$as_echo "$ac_cv_prog_cxx_g" >&6; } +if test "$ac_test_CXXFLAGS" = set; then + CXXFLAGS=$ac_save_CXXFLAGS +elif test $ac_cv_prog_cxx_g = yes; then + if test "$GXX" = yes; then + CXXFLAGS="-g -O2" + else + CXXFLAGS="-g" + fi +else + if test "$GXX" = yes; then + CXXFLAGS="-O2" + else + CXXFLAGS= + fi +fi +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + +depcc="$CXX" am_compiler_list= + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking dependency style of $depcc" >&5 +$as_echo_n "checking dependency style of $depcc... " >&6; } +if ${am_cv_CXX_dependencies_compiler_type+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -z "$AMDEP_TRUE" && test -f "$am_depcomp"; then + # We make a subdir and do the tests there. Otherwise we can end up + # making bogus files that we don't know about and never remove. For + # instance it was reported that on HP-UX the gcc test will end up + # making a dummy file named 'D' -- because '-MD' means "put the output + # in D". + rm -rf conftest.dir + mkdir conftest.dir + # Copy depcomp to subdir because otherwise we won't find it if we're + # using a relative directory. + cp "$am_depcomp" conftest.dir + cd conftest.dir + # We will build objects and dependencies in a subdirectory because + # it helps to detect inapplicable dependency modes. For instance + # both Tru64's cc and ICC support -MD to output dependencies as a + # side effect of compilation, but ICC will put the dependencies in + # the current directory while Tru64 will put them in the object + # directory. + mkdir sub + + am_cv_CXX_dependencies_compiler_type=none + if test "$am_compiler_list" = ""; then + am_compiler_list=`sed -n 's/^#*\([a-zA-Z0-9]*\))$/\1/p' < ./depcomp` + fi + am__universal=false + case " $depcc " in #( + *\ -arch\ *\ -arch\ *) am__universal=true ;; + esac + + for depmode in $am_compiler_list; do + # Setup a source with many dependencies, because some compilers + # like to wrap large dependency lists on column 80 (with \), and + # we should not choose a depcomp mode which is confused by this. + # + # We need to recreate these files for each test, as the compiler may + # overwrite some of them when testing with obscure command lines. + # This happens at least with the AIX C compiler. + : > sub/conftest.c + for i in 1 2 3 4 5 6; do + echo '#include "conftst'$i'.h"' >> sub/conftest.c + # Using ": > sub/conftst$i.h" creates only sub/conftst1.h with + # Solaris 10 /bin/sh. + echo '/* dummy */' > sub/conftst$i.h + done + echo "${am__include} ${am__quote}sub/conftest.Po${am__quote}" > confmf + + # We check with '-c' and '-o' for the sake of the "dashmstdout" + # mode. It turns out that the SunPro C++ compiler does not properly + # handle '-M -o', and we need to detect this. Also, some Intel + # versions had trouble with output in subdirs. 
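# For orientation, the depmode being probed here is a recipe for getting
# make-style dependencies as a side effect of compiling; with a gcc-like
# compiler the essence is (illustrative, assumes gcc; file names are
# examples only):
cat > depstyle_demo.c <<'EOF'
#include "depstyle_demo.h"
int main (void) { return 0; }
EOF
echo '/* dummy */' > depstyle_demo.h
gcc -MD -MF depstyle_demo.Po -c depstyle_demo.c -o depstyle_demo.o &&
  grep depstyle_demo.h depstyle_demo.Po >/dev/null &&
  echo "gcc3-style dependency tracking available"
rm -f depstyle_demo.c depstyle_demo.h depstyle_demo.o depstyle_demo.Po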
+ am__obj=sub/conftest.${OBJEXT-o} + am__minus_obj="-o $am__obj" + case $depmode in + gcc) + # This depmode causes a compiler race in universal mode. + test "$am__universal" = false || continue + ;; + nosideeffect) + # After this tag, mechanisms are not by side-effect, so they'll + # only be used when explicitly requested. + if test "x$enable_dependency_tracking" = xyes; then + continue + else + break + fi + ;; + msvc7 | msvc7msys | msvisualcpp | msvcmsys) + # This compiler won't grok '-c -o', but also, the minuso test has + # not run yet. These depmodes are late enough in the game, and + # so weak that their functioning should not be impacted. + am__obj=conftest.${OBJEXT-o} + am__minus_obj= + ;; + none) break ;; + esac + if depmode=$depmode \ + source=sub/conftest.c object=$am__obj \ + depfile=sub/conftest.Po tmpdepfile=sub/conftest.TPo \ + $SHELL ./depcomp $depcc -c $am__minus_obj sub/conftest.c \ + >/dev/null 2>conftest.err && + grep sub/conftst1.h sub/conftest.Po > /dev/null 2>&1 && + grep sub/conftst6.h sub/conftest.Po > /dev/null 2>&1 && + grep $am__obj sub/conftest.Po > /dev/null 2>&1 && + ${MAKE-make} -s -f confmf > /dev/null 2>&1; then + # icc doesn't choke on unknown options, it will just issue warnings + # or remarks (even with -Werror). So we grep stderr for any message + # that says an option was ignored or not supported. + # When given -MP, icc 7.0 and 7.1 complain thusly: + # icc: Command line warning: ignoring option '-M'; no argument required + # The diagnosis changed in icc 8.0: + # icc: Command line remark: option '-MP' not supported + if (grep 'ignoring option' conftest.err || + grep 'not supported' conftest.err) >/dev/null 2>&1; then :; else + am_cv_CXX_dependencies_compiler_type=$depmode + break + fi + fi + done + + cd .. + rm -rf conftest.dir +else + am_cv_CXX_dependencies_compiler_type=none +fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_CXX_dependencies_compiler_type" >&5 +$as_echo "$am_cv_CXX_dependencies_compiler_type" >&6; } +CXXDEPMODE=depmode=$am_cv_CXX_dependencies_compiler_type + + if + test "x$enable_dependency_tracking" != xno \ + && test "$am_cv_CXX_dependencies_compiler_type" = gcc3; then + am__fastdepCXX_TRUE= + am__fastdepCXX_FALSE='#' +else + am__fastdepCXX_TRUE='#' + am__fastdepCXX_FALSE= +fi + + + + OPENMP_CFLAGS= + # Check whether --enable-openmp was given. +if test "${enable_openmp+set}" = set; then : + enableval=$enable_openmp; +fi + + if test "$enable_openmp" != no; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $CC option to support OpenMP" >&5 +$as_echo_n "checking for $CC option to support OpenMP... " >&6; } +if ${ac_cv_prog_c_openmp+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +#ifndef _OPENMP + choke me +#endif +#include +int main () { return omp_get_num_threads (); } + +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_prog_c_openmp='none needed' +else + ac_cv_prog_c_openmp='unsupported' + for ac_option in -fopenmp -xopenmp -openmp -mp -omp -qsmp=omp -homp \ + -Popenmp --openmp; do + ac_save_CFLAGS=$CFLAGS + CFLAGS="$CFLAGS $ac_option" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ + +#ifndef _OPENMP + choke me +#endif +#include +int main () { return omp_get_num_threads (); } + +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_prog_c_openmp=$ac_option +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext + CFLAGS=$ac_save_CFLAGS + if test "$ac_cv_prog_c_openmp" != unsupported; then + break + fi + done +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_c_openmp" >&5 +$as_echo "$ac_cv_prog_c_openmp" >&6; } + case $ac_cv_prog_c_openmp in #( + "none needed" | unsupported) + ;; #( + *) + OPENMP_CFLAGS=$ac_cv_prog_c_openmp ;; + esac + fi + + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for ANSI C header files" >&5 +$as_echo_n "checking for ANSI C header files... " >&6; } +if ${ac_cv_header_stdc+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +#include +#include +#include + +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_header_stdc=yes +else + ac_cv_header_stdc=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + +if test $ac_cv_header_stdc = yes; then + # SunOS 4.x string.h does not declare mem*, contrary to ANSI. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include + +_ACEOF +if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | + $EGREP "memchr" >/dev/null 2>&1; then : + +else + ac_cv_header_stdc=no +fi +rm -f conftest* + +fi + +if test $ac_cv_header_stdc = yes; then + # ISC 2.0.2 stdlib.h does not declare free, contrary to ANSI. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include + +_ACEOF +if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | + $EGREP "free" >/dev/null 2>&1; then : + +else + ac_cv_header_stdc=no +fi +rm -f conftest* + +fi + +if test $ac_cv_header_stdc = yes; then + # /bin/cc in Irix-4.0.5 gets non-ANSI ctype macros unless using -ansi. + if test "$cross_compiling" = yes; then : + : +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +#include +#if ((' ' & 0x0FF) == 0x020) +# define ISLOWER(c) ('a' <= (c) && (c) <= 'z') +# define TOUPPER(c) (ISLOWER(c) ? 'A' + ((c) - 'a') : (c)) +#else +# define ISLOWER(c) \ + (('a' <= (c) && (c) <= 'i') \ + || ('j' <= (c) && (c) <= 'r') \ + || ('s' <= (c) && (c) <= 'z')) +# define TOUPPER(c) (ISLOWER(c) ? ((c) | 0x40) : (c)) +#endif + +#define XOR(e, f) (((e) && !(f)) || (!(e) && (f))) +int +main () +{ + int i; + for (i = 0; i < 256; i++) + if (XOR (islower (i), ISLOWER (i)) + || toupper (i) != TOUPPER (i)) + return 2; + return 0; +} +_ACEOF +if ac_fn_c_try_run "$LINENO"; then : + +else + ac_cv_header_stdc=no +fi +rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \ + conftest.$ac_objext conftest.beam conftest.$ac_ext +fi + +fi +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_header_stdc" >&5 +$as_echo "$ac_cv_header_stdc" >&6; } +if test $ac_cv_header_stdc = yes; then + +$as_echo "#define STDC_HEADERS 1" >>confdefs.h + +fi + +# On IRIX 5.3, sys/types and inttypes.h are conflicting. 
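# Each of the header probes below is essentially "does a tiny program that
# includes this header compile"; a success appends the matching HAVE_*_H
# define to confdefs.h.  Hand-run sketch for one of the headers checked
# here (illustrative; assumes some cc is on PATH):
cat > header_probe_demo.c <<'EOF'
#include <sys/endian.h>
int main (void) { return 0; }
EOF
if cc -c header_probe_demo.c -o header_probe_demo.o 2>/dev/null; then
  echo '#define HAVE_SYS_ENDIAN_H 1'
fi
rm -f header_probe_demo.c header_probe_demo.o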
+for ac_header in sys/types.h sys/stat.h stdlib.h string.h memory.h strings.h \ + inttypes.h stdint.h unistd.h +do : + as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh` +ac_fn_c_check_header_compile "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default +" +if eval test \"x\$"$as_ac_Header"\" = x"yes"; then : + cat >>confdefs.h <<_ACEOF +#define `$as_echo "HAVE_$ac_header" | $as_tr_cpp` 1 +_ACEOF + +fi + +done + + +for ac_header in sys/endian.h sys/param.h syslog.h +do : + as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh` +ac_fn_c_check_header_mongrel "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default" +if eval test \"x\$"$as_ac_Header"\" = x"yes"; then : + cat >>confdefs.h <<_ACEOF +#define `$as_echo "HAVE_$ac_header" | $as_tr_cpp` 1 +_ACEOF + +fi + +done + +# sys/sysctl.h requires sys/types.h on FreeBSD +# sys/sysctl.h requires sys/param.h on OpenBSD +for ac_header in sys/sysctl.h +do : + ac_fn_c_check_header_compile "$LINENO" "sys/sysctl.h" "ac_cv_header_sys_sysctl_h" "#include +#ifdef HAVE_SYS_PARAM_H +#include +#endif + +" +if test "x$ac_cv_header_sys_sysctl_h" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_SYS_SYSCTL_H 1 +_ACEOF + +fi + +done + + +ac_fn_c_check_decl "$LINENO" "be32dec" "ac_cv_have_decl_be32dec" "$ac_includes_default +#ifdef HAVE_SYS_ENDIAN_H +#include +#endif + +" +if test "x$ac_cv_have_decl_be32dec" = xyes; then : + ac_have_decl=1 +else + ac_have_decl=0 +fi + +cat >>confdefs.h <<_ACEOF +#define HAVE_DECL_BE32DEC $ac_have_decl +_ACEOF +ac_fn_c_check_decl "$LINENO" "le32dec" "ac_cv_have_decl_le32dec" "$ac_includes_default +#ifdef HAVE_SYS_ENDIAN_H +#include +#endif + +" +if test "x$ac_cv_have_decl_le32dec" = xyes; then : + ac_have_decl=1 +else + ac_have_decl=0 +fi + +cat >>confdefs.h <<_ACEOF +#define HAVE_DECL_LE32DEC $ac_have_decl +_ACEOF +ac_fn_c_check_decl "$LINENO" "be32enc" "ac_cv_have_decl_be32enc" "$ac_includes_default +#ifdef HAVE_SYS_ENDIAN_H +#include +#endif + +" +if test "x$ac_cv_have_decl_be32enc" = xyes; then : + ac_have_decl=1 +else + ac_have_decl=0 +fi + +cat >>confdefs.h <<_ACEOF +#define HAVE_DECL_BE32ENC $ac_have_decl +_ACEOF +ac_fn_c_check_decl "$LINENO" "le32enc" "ac_cv_have_decl_le32enc" "$ac_includes_default +#ifdef HAVE_SYS_ENDIAN_H +#include +#endif + +" +if test "x$ac_cv_have_decl_le32enc" = xyes; then : + ac_have_decl=1 +else + ac_have_decl=0 +fi + +cat >>confdefs.h <<_ACEOF +#define HAVE_DECL_LE32ENC $ac_have_decl +_ACEOF + + +ac_fn_c_check_type "$LINENO" "size_t" "ac_cv_type_size_t" "$ac_includes_default" +if test "x$ac_cv_type_size_t" = xyes; then : + +else + +cat >>confdefs.h <<_ACEOF +#define size_t unsigned int +_ACEOF + +fi + +# The Ultrix 4.2 mips builtin alloca declared by alloca.h only works +# for constant arguments. Useless! +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for working alloca.h" >&5 +$as_echo_n "checking for working alloca.h... " >&6; } +if ${ac_cv_working_alloca_h+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +#include +int +main () +{ +char *p = (char *) alloca (2 * sizeof (int)); + if (p) return 0; + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_working_alloca_h=yes +else + ac_cv_working_alloca_h=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_working_alloca_h" >&5 +$as_echo "$ac_cv_working_alloca_h" >&6; } +if test $ac_cv_working_alloca_h = yes; then + +$as_echo "#define HAVE_ALLOCA_H 1" >>confdefs.h + +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for alloca" >&5 +$as_echo_n "checking for alloca... " >&6; } +if ${ac_cv_func_alloca_works+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#ifdef __GNUC__ +# define alloca __builtin_alloca +#else +# ifdef _MSC_VER +# include +# define alloca _alloca +# else +# ifdef HAVE_ALLOCA_H +# include +# else +# ifdef _AIX + #pragma alloca +# else +# ifndef alloca /* predefined by HP cc +Olibcalls */ +void *alloca (size_t); +# endif +# endif +# endif +# endif +#endif + +int +main () +{ +char *p = (char *) alloca (1); + if (p) return 0; + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_func_alloca_works=yes +else + ac_cv_func_alloca_works=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_func_alloca_works" >&5 +$as_echo "$ac_cv_func_alloca_works" >&6; } + +if test $ac_cv_func_alloca_works = yes; then + +$as_echo "#define HAVE_ALLOCA 1" >>confdefs.h + +else + # The SVR3 libPW and SVR4 libucb both contain incompatible functions +# that cause trouble. Some versions do not even contain alloca or +# contain a buggy version. If you still want to use their alloca, +# use ar to extract alloca.o from them instead of compiling alloca.c. + +ALLOCA=\${LIBOBJDIR}alloca.$ac_objext + +$as_echo "#define C_ALLOCA 1" >>confdefs.h + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether \`alloca.c' needs Cray hooks" >&5 +$as_echo_n "checking whether \`alloca.c' needs Cray hooks... " >&6; } +if ${ac_cv_os_cray+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#if defined CRAY && ! defined CRAY2 +webecray +#else +wenotbecray +#endif + +_ACEOF +if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | + $EGREP "webecray" >/dev/null 2>&1; then : + ac_cv_os_cray=yes +else + ac_cv_os_cray=no +fi +rm -f conftest* + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_os_cray" >&5 +$as_echo "$ac_cv_os_cray" >&6; } +if test $ac_cv_os_cray = yes; then + for ac_func in _getb67 GETB67 getb67; do + as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh` +ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var" +if eval test \"x\$"$as_ac_var"\" = x"yes"; then : + +cat >>confdefs.h <<_ACEOF +#define CRAY_STACKSEG_END $ac_func +_ACEOF + + break +fi + + done +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking stack direction for C alloca" >&5 +$as_echo_n "checking stack direction for C alloca... " >&6; } +if ${ac_cv_c_stack_direction+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test "$cross_compiling" = yes; then : + ac_cv_c_stack_direction=0 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +$ac_includes_default +int +find_stack_direction (int *addr, int depth) +{ + int dir, dummy = 0; + if (! addr) + addr = &dummy; + *addr = addr < &dummy ? 
1 : addr == &dummy ? 0 : -1; + dir = depth ? find_stack_direction (addr, depth - 1) : 0; + return dir + dummy; +} + +int +main (int argc, char **argv) +{ + return find_stack_direction (0, argc + !argv + 20) < 0; +} +_ACEOF +if ac_fn_c_try_run "$LINENO"; then : + ac_cv_c_stack_direction=1 +else + ac_cv_c_stack_direction=-1 +fi +rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \ + conftest.$ac_objext conftest.beam conftest.$ac_ext +fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_c_stack_direction" >&5 +$as_echo "$ac_cv_c_stack_direction" >&6; } +cat >>confdefs.h <<_ACEOF +#define STACK_DIRECTION $ac_cv_c_stack_direction +_ACEOF + + +fi + +for ac_func in getopt_long +do : + ac_fn_c_check_func "$LINENO" "getopt_long" "ac_cv_func_getopt_long" +if test "x$ac_cv_func_getopt_long" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_GETOPT_LONG 1 +_ACEOF + +fi +done + + +case $target in + i*86-*-*) + have_x86=true + ;; + x86_64-*-*) + have_x86=true + have_x86_64=true + ;; + amd64-*-*) + have_x86=true + have_x86_64=true + ;; +esac + +PTHREAD_FLAGS="-pthread" +WS2_LIBS="" + +case $target in + *-*-mingw*) + have_win32=true + PTHREAD_FLAGS="" + WS2_LIBS="-lws2_32" + ;; +esac + +if test x$have_x86 = xtrue +then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we can compile AVX code" >&5 +$as_echo_n "checking whether we can compile AVX code... " >&6; } + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ +asm ("vmovdqa %ymm0, %ymm1"); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + +$as_echo "#define USE_AVX 1" >>confdefs.h + + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we can compile XOP code" >&5 +$as_echo_n "checking whether we can compile XOP code... " >&6; } + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ +asm ("vprotd \$7, %xmm0, %xmm1"); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + +$as_echo "#define USE_XOP 1" >>confdefs.h + + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the XOP instruction set." >&5 +$as_echo "$as_me: WARNING: The assembler does not support the XOP instruction set." >&2;} + +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we can compile AVX2 code" >&5 +$as_echo_n "checking whether we can compile AVX2 code... " >&6; } + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ +asm ("vpaddd %ymm0, %ymm1, %ymm2"); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + +$as_echo "#define USE_AVX2 1" >>confdefs.h + + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the AVX2 instruction set." >&5 +$as_echo "$as_me: WARNING: The assembler does not support the AVX2 instruction set." 
>&2;} + +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the AVX instruction set." >&5 +$as_echo "$as_me: WARNING: The assembler does not support the AVX instruction set." >&2;} + +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for json_loads in -ljansson" >&5 +$as_echo_n "checking for json_loads in -ljansson... " >&6; } +if ${ac_cv_lib_jansson_json_loads+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-ljansson $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char json_loads (); +int +main () +{ +return json_loads (); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_lib_jansson_json_loads=yes +else + ac_cv_lib_jansson_json_loads=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_jansson_json_loads" >&5 +$as_echo "$ac_cv_lib_jansson_json_loads" >&6; } +if test "x$ac_cv_lib_jansson_json_loads" = xyes; then : + request_jansson=false +else + request_jansson=true +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthread" >&5 +$as_echo_n "checking for pthread_create in -lpthread... " >&6; } +if ${ac_cv_lib_pthread_pthread_create+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lpthread $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char pthread_create (); +int +main () +{ +return pthread_create (); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_lib_pthread_pthread_create=yes +else + ac_cv_lib_pthread_pthread_create=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_pthread_pthread_create" >&5 +$as_echo "$ac_cv_lib_pthread_pthread_create" >&6; } +if test "x$ac_cv_lib_pthread_pthread_create" = xyes; then : + PTHREAD_LIBS="-lpthread" +else + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthreadGC2" >&5 +$as_echo_n "checking for pthread_create in -lpthreadGC2... " >&6; } +if ${ac_cv_lib_pthreadGC2_pthread_create+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lpthreadGC2 $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. 
*/ +#ifdef __cplusplus +extern "C" +#endif +char pthread_create (); +int +main () +{ +return pthread_create (); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_lib_pthreadGC2_pthread_create=yes +else + ac_cv_lib_pthreadGC2_pthread_create=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_pthreadGC2_pthread_create" >&5 +$as_echo "$ac_cv_lib_pthreadGC2_pthread_create" >&6; } +if test "x$ac_cv_lib_pthreadGC2_pthread_create" = xyes; then : + PTHREAD_LIBS="-lpthreadGC2" +else + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthreadGC1" >&5 +$as_echo_n "checking for pthread_create in -lpthreadGC1... " >&6; } +if ${ac_cv_lib_pthreadGC1_pthread_create+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lpthreadGC1 $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char pthread_create (); +int +main () +{ +return pthread_create (); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_lib_pthreadGC1_pthread_create=yes +else + ac_cv_lib_pthreadGC1_pthread_create=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_pthreadGC1_pthread_create" >&5 +$as_echo "$ac_cv_lib_pthreadGC1_pthread_create" >&6; } +if test "x$ac_cv_lib_pthreadGC1_pthread_create" = xyes; then : + PTHREAD_LIBS="-lpthreadGC1" +else + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthreadGC" >&5 +$as_echo_n "checking for pthread_create in -lpthreadGC... " >&6; } +if ${ac_cv_lib_pthreadGC_pthread_create+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lpthreadGC $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char pthread_create (); +int +main () +{ +return pthread_create (); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_lib_pthreadGC_pthread_create=yes +else + ac_cv_lib_pthreadGC_pthread_create=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_pthreadGC_pthread_create" >&5 +$as_echo "$ac_cv_lib_pthreadGC_pthread_create" >&6; } +if test "x$ac_cv_lib_pthreadGC_pthread_create" = xyes; then : + PTHREAD_LIBS="-lpthreadGC" + +fi + +fi + +fi + +fi + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for SSL_library_init in -lssl" >&5 +$as_echo_n "checking for SSL_library_init in -lssl... " >&6; } +if ${ac_cv_lib_ssl_SSL_library_init+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lssl $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. 
+ Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char SSL_library_init (); +int +main () +{ +return SSL_library_init (); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_lib_ssl_SSL_library_init=yes +else + ac_cv_lib_ssl_SSL_library_init=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_ssl_SSL_library_init" >&5 +$as_echo "$ac_cv_lib_ssl_SSL_library_init" >&6; } +if test "x$ac_cv_lib_ssl_SSL_library_init" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_LIBSSL 1 +_ACEOF + + LIBS="-lssl $LIBS" + +else + as_fn_error $? "OpenSSL library required" "$LINENO" 5 +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for EVP_DigestFinal_ex in -lcrypto" >&5 +$as_echo_n "checking for EVP_DigestFinal_ex in -lcrypto... " >&6; } +if ${ac_cv_lib_crypto_EVP_DigestFinal_ex+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lcrypto $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char EVP_DigestFinal_ex (); +int +main () +{ +return EVP_DigestFinal_ex (); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_lib_crypto_EVP_DigestFinal_ex=yes +else + ac_cv_lib_crypto_EVP_DigestFinal_ex=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_crypto_EVP_DigestFinal_ex" >&5 +$as_echo "$ac_cv_lib_crypto_EVP_DigestFinal_ex" >&6; } +if test "x$ac_cv_lib_crypto_EVP_DigestFinal_ex" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_LIBCRYPTO 1 +_ACEOF + + LIBS="-lcrypto $LIBS" + +else + as_fn_error $? "OpenSSL library required" "$LINENO" 5 +fi + + + if test x$request_jansson = xtrue; then + WANT_JANSSON_TRUE= + WANT_JANSSON_FALSE='#' +else + WANT_JANSSON_TRUE='#' + WANT_JANSSON_FALSE= +fi + + if test x$have_win32 = xtrue; then + HAVE_WINDOWS_TRUE= + HAVE_WINDOWS_FALSE='#' +else + HAVE_WINDOWS_TRUE='#' + HAVE_WINDOWS_FALSE= +fi + + if test x$have_x86 = xtrue; then + ARCH_x86_TRUE= + ARCH_x86_FALSE='#' +else + ARCH_x86_TRUE='#' + ARCH_x86_FALSE= +fi + + if test x$have_x86_64 = xtrue; then + ARCH_x86_64_TRUE= + ARCH_x86_64_FALSE='#' +else + ARCH_x86_64_TRUE='#' + ARCH_x86_64_FALSE= +fi + + +if test x$request_jansson = xtrue +then + JANSSON_LIBS="compat/jansson/libjansson.a" +else + JANSSON_LIBS=-ljansson +fi + + + + + + + + + + + + + + + + + + + + + + + + + + +# Check whether --with-libcurl was given. +if test "${with_libcurl+set}" = set; then : + withval=$with_libcurl; _libcurl_with=$withval +else + _libcurl_with=yes +fi + + + if test "$_libcurl_with" != "no" ; then + + for ac_prog in gawk mawk nawk awk +do + # Extract the first word of "$ac_prog", so it can be a program name with args. +set dummy $ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_AWK+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$AWK"; then + ac_cv_prog_AWK="$AWK" # Let the user override the test. 
+else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_AWK="$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +AWK=$ac_cv_prog_AWK +if test -n "$AWK"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $AWK" >&5 +$as_echo "$AWK" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$AWK" && break +done + + + _libcurl_version_parse="eval $AWK '{split(\$NF,A,\".\"); X=256*256*A[1]+256*A[2]+A[3]; print X;}'" + + _libcurl_try_link=yes + + if test -d "$_libcurl_with" ; then + LIBCURL_CPPFLAGS="-I$withval/include" + _libcurl_ldflags="-L$withval/lib" + # Extract the first word of "curl-config", so it can be a program name with args. +set dummy curl-config; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_path__libcurl_config+:} false; then : + $as_echo_n "(cached) " >&6 +else + case $_libcurl_config in + [\\/]* | ?:[\\/]*) + ac_cv_path__libcurl_config="$_libcurl_config" # Let the user override the test with a path. + ;; + *) + as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in "$withval/bin" +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_path__libcurl_config="$as_dir/$ac_word$ac_exec_ext" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + + ;; +esac +fi +_libcurl_config=$ac_cv_path__libcurl_config +if test -n "$_libcurl_config"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $_libcurl_config" >&5 +$as_echo "$_libcurl_config" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + else + # Extract the first word of "curl-config", so it can be a program name with args. +set dummy curl-config; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_path__libcurl_config+:} false; then : + $as_echo_n "(cached) " >&6 +else + case $_libcurl_config in + [\\/]* | ?:[\\/]*) + ac_cv_path__libcurl_config="$_libcurl_config" # Let the user override the test with a path. + ;; + *) + as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_path__libcurl_config="$as_dir/$ac_word$ac_exec_ext" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + + ;; +esac +fi +_libcurl_config=$ac_cv_path__libcurl_config +if test -n "$_libcurl_config"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $_libcurl_config" >&5 +$as_echo "$_libcurl_config" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + fi + + if test x$_libcurl_config != "x" ; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for the version of libcurl" >&5 +$as_echo_n "checking for the version of libcurl... 
" >&6; } +if ${libcurl_cv_lib_curl_version+:} false; then : + $as_echo_n "(cached) " >&6 +else + libcurl_cv_lib_curl_version=`$_libcurl_config --version | $AWK '{print $2}'` +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libcurl_cv_lib_curl_version" >&5 +$as_echo "$libcurl_cv_lib_curl_version" >&6; } + + _libcurl_version=`echo $libcurl_cv_lib_curl_version | $_libcurl_version_parse` + _libcurl_wanted=`echo 7.15.2 | $_libcurl_version_parse` + + if test $_libcurl_wanted -gt 0 ; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for libcurl >= version 7.15.2" >&5 +$as_echo_n "checking for libcurl >= version 7.15.2... " >&6; } +if ${libcurl_cv_lib_version_ok+:} false; then : + $as_echo_n "(cached) " >&6 +else + + if test $_libcurl_version -ge $_libcurl_wanted ; then + libcurl_cv_lib_version_ok=yes + else + libcurl_cv_lib_version_ok=no + fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libcurl_cv_lib_version_ok" >&5 +$as_echo "$libcurl_cv_lib_version_ok" >&6; } + fi + + if test $_libcurl_wanted -eq 0 || test x$libcurl_cv_lib_version_ok = xyes ; then + if test x"$LIBCURL_CPPFLAGS" = "x" ; then + LIBCURL_CPPFLAGS=`$_libcurl_config --cflags` + fi + if test x"$LIBCURL" = "x" ; then + LIBCURL=`$_libcurl_config --libs` + + # This is so silly, but Apple actually has a bug in their + # curl-config script. Fixed in Tiger, but there are still + # lots of Panther installs around. + case "${host}" in + powerpc-apple-darwin7*) + LIBCURL=`echo $LIBCURL | sed -e 's|-arch i386||g'` + ;; + esac + fi + + # All curl-config scripts support --feature + _libcurl_features=`$_libcurl_config --feature` + + # Is it modern enough to have --protocols? (7.12.4) + if test $_libcurl_version -ge 461828 ; then + _libcurl_protocols=`$_libcurl_config --protocols` + fi + else + _libcurl_try_link=no + fi + + unset _libcurl_wanted + fi + + if test $_libcurl_try_link = yes ; then + + # we didn't find curl-config, so let's see if the user-supplied + # link line (or failing that, "-lcurl") is enough. + LIBCURL=${LIBCURL-"$_libcurl_ldflags -lcurl"} + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether libcurl is usable" >&5 +$as_echo_n "checking whether libcurl is usable... " >&6; } +if ${libcurl_cv_lib_curl_usable+:} false; then : + $as_echo_n "(cached) " >&6 +else + + _libcurl_save_cppflags=$CPPFLAGS + CPPFLAGS="$LIBCURL_CPPFLAGS $CPPFLAGS" + _libcurl_save_libs=$LIBS + LIBS="$LIBCURL $LIBS" + + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +int +main () +{ + +/* Try and use a few common options to force a failure if we are + missing symbols or can't link. */ +int x; +curl_easy_setopt(NULL,CURLOPT_URL,NULL); +x=CURL_ERROR_SIZE; +x=CURLOPT_WRITEFUNCTION; +x=CURLOPT_FILE; +x=CURLOPT_ERRORBUFFER; +x=CURLOPT_STDERR; +x=CURLOPT_VERBOSE; +if (x) ; + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + libcurl_cv_lib_curl_usable=yes +else + libcurl_cv_lib_curl_usable=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext + + CPPFLAGS=$_libcurl_save_cppflags + LIBS=$_libcurl_save_libs + unset _libcurl_save_cppflags + unset _libcurl_save_libs + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libcurl_cv_lib_curl_usable" >&5 +$as_echo "$libcurl_cv_lib_curl_usable" >&6; } + + if test $libcurl_cv_lib_curl_usable = yes ; then + + # Does curl_free() exist in this version of libcurl? 
+ # If not, fake it with free() + + _libcurl_save_cppflags=$CPPFLAGS + CPPFLAGS="$CPPFLAGS $LIBCURL_CPPFLAGS" + _libcurl_save_libs=$LIBS + LIBS="$LIBS $LIBCURL" + + ac_fn_c_check_func "$LINENO" "curl_free" "ac_cv_func_curl_free" +if test "x$ac_cv_func_curl_free" = xyes; then : + +else + +$as_echo "#define curl_free free" >>confdefs.h + +fi + + + CPPFLAGS=$_libcurl_save_cppflags + LIBS=$_libcurl_save_libs + unset _libcurl_save_cppflags + unset _libcurl_save_libs + + +$as_echo "#define HAVE_LIBCURL 1" >>confdefs.h + + + + + for _libcurl_feature in $_libcurl_features ; do + cat >>confdefs.h <<_ACEOF +#define `$as_echo "libcurl_feature_$_libcurl_feature" | $as_tr_cpp` 1 +_ACEOF + + eval `$as_echo "libcurl_feature_$_libcurl_feature" | $as_tr_sh`=yes + done + + if test "x$_libcurl_protocols" = "x" ; then + + # We don't have --protocols, so just assume that all + # protocols are available + _libcurl_protocols="HTTP FTP FILE TELNET LDAP DICT TFTP" + + if test x$libcurl_feature_SSL = xyes ; then + _libcurl_protocols="$_libcurl_protocols HTTPS" + + # FTPS wasn't standards-compliant until version + # 7.11.0 (0x070b00 == 461568) + if test $_libcurl_version -ge 461568; then + _libcurl_protocols="$_libcurl_protocols FTPS" + fi + fi + + # RTSP, IMAP, POP3 and SMTP were added in + # 7.20.0 (0x071400 == 463872) + if test $_libcurl_version -ge 463872; then + _libcurl_protocols="$_libcurl_protocols RTSP IMAP POP3 SMTP" + fi + fi + + for _libcurl_protocol in $_libcurl_protocols ; do + cat >>confdefs.h <<_ACEOF +#define `$as_echo "libcurl_protocol_$_libcurl_protocol" | $as_tr_cpp` 1 +_ACEOF + + eval `$as_echo "libcurl_protocol_$_libcurl_protocol" | $as_tr_sh`=yes + done + else + unset LIBCURL + unset LIBCURL_CPPFLAGS + fi + fi + + unset _libcurl_try_link + unset _libcurl_version_parse + unset _libcurl_config + unset _libcurl_feature + unset _libcurl_features + unset _libcurl_protocol + unset _libcurl_protocols + unset _libcurl_version + unset _libcurl_ldflags + fi + + if test x$_libcurl_with = xno || test x$libcurl_cv_lib_curl_usable != xyes ; then + # This is the IF-NO path + as_fn_error $? "Missing required libcurl >= 7.15.2" "$LINENO" 5 + else + # This is the IF-YES path + : + fi + + unset _libcurl_with + + + + + + + +ac_config_files="$ac_config_files Makefile compat/Makefile compat/jansson/Makefile" + + +ARCH=`uname -m` +if [ $ARCH == "x86_64" ]; +then + SUFFIX="64" +else + SUFFIX="" +fi + + +# Check whether --with-cuda was given. +if test "${with_cuda+set}" = set; then : + withval=$with_cuda; +fi + + +if test -n "$with_cuda" +then + CUDA_CFLAGS="-I$with_cuda/include" + CUDA_LIBS="-lcudart" + CUDA_LDFLAGS="-L$with_cuda/lib$SUFFIX" + NVCC="$with_cuda/bin/nvcc" +else + CUDA_CFLAGS="-I/usr/local/cuda/include" + CUDA_LIBS="-lcudart -static-libstdc++" + CUDA_LDFLAGS="-L/usr/local/cuda/lib$SUFFIX" + NVCC="nvcc" +fi + + + + + + + +cat >confcache <<\_ACEOF +# This file is a shell script that caches the results of configure +# tests run on this system so they can be shared between configure +# scripts and configure runs, see configure's option --config-cache. +# It is not useful on other systems. If it contains results you don't +# want to keep, you may remove or edit it. +# +# config.status only pays attention to the cache file if you give it +# the --recheck option to rerun configure. +# +# `ac_cv_env_foo' variables (set or unset) will be overridden when +# loading this file, other *unset* `ac_cv_foo' will be assigned the +# following values. 
+ +_ACEOF + +# The following way of writing the cache mishandles newlines in values, +# but we know of no workaround that is simple, portable, and efficient. +# So, we kill variables containing newlines. +# Ultrix sh set writes to stderr and can't be redirected directly, +# and sets the high bit in the cache file unless we assign to the vars. +( + for ac_var in `(set) 2>&1 | sed -n 's/^\([a-zA-Z_][a-zA-Z0-9_]*\)=.*/\1/p'`; do + eval ac_val=\$$ac_var + case $ac_val in #( + *${as_nl}*) + case $ac_var in #( + *_cv_*) { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: cache variable $ac_var contains a newline" >&5 +$as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;; + esac + case $ac_var in #( + _ | IFS | as_nl) ;; #( + BASH_ARGV | BASH_SOURCE) eval $ac_var= ;; #( + *) { eval $ac_var=; unset $ac_var;} ;; + esac ;; + esac + done + + (set) 2>&1 | + case $as_nl`(ac_space=' '; set) 2>&1` in #( + *${as_nl}ac_space=\ *) + # `set' does not quote correctly, so add quotes: double-quote + # substitution turns \\\\ into \\, and sed turns \\ into \. + sed -n \ + "s/'/'\\\\''/g; + s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='\\2'/p" + ;; #( + *) + # `set' quotes correctly as required by POSIX, so do not add quotes. + sed -n "/^[_$as_cr_alnum]*_cv_[_$as_cr_alnum]*=/p" + ;; + esac | + sort +) | + sed ' + /^ac_cv_env_/b end + t clear + :clear + s/^\([^=]*\)=\(.*[{}].*\)$/test "${\1+set}" = set || &/ + t end + s/^\([^=]*\)=\(.*\)$/\1=${\1=\2}/ + :end' >>confcache +if diff "$cache_file" confcache >/dev/null 2>&1; then :; else + if test -w "$cache_file"; then + if test "x$cache_file" != "x/dev/null"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: updating cache $cache_file" >&5 +$as_echo "$as_me: updating cache $cache_file" >&6;} + if test ! -f "$cache_file" || test -h "$cache_file"; then + cat confcache >"$cache_file" + else + case $cache_file in #( + */* | ?:*) + mv -f confcache "$cache_file"$$ && + mv -f "$cache_file"$$ "$cache_file" ;; #( + *) + mv -f confcache "$cache_file" ;; + esac + fi + fi + else + { $as_echo "$as_me:${as_lineno-$LINENO}: not updating unwritable cache $cache_file" >&5 +$as_echo "$as_me: not updating unwritable cache $cache_file" >&6;} + fi +fi +rm -f confcache + +test "x$prefix" = xNONE && prefix=$ac_default_prefix +# Let make expand exec_prefix. +test "x$exec_prefix" = xNONE && exec_prefix='${prefix}' + +DEFS=-DHAVE_CONFIG_H + +ac_libobjs= +ac_ltlibobjs= +U= +for ac_i in : $LIBOBJS; do test "x$ac_i" = x: && continue + # 1. Remove the extension, and $U if already installed. + ac_script='s/\$U\././;s/\.o$//;s/\.obj$//' + ac_i=`$as_echo "$ac_i" | sed "$ac_script"` + # 2. Prepend LIBOBJDIR. When used with automake>=1.10 LIBOBJDIR + # will be set to the directory where LIBOBJS objects are built. + as_fn_append ac_libobjs " \${LIBOBJDIR}$ac_i\$U.$ac_objext" + as_fn_append ac_ltlibobjs " \${LIBOBJDIR}$ac_i"'$U.lo' +done +LIBOBJS=$ac_libobjs + +LTLIBOBJS=$ac_ltlibobjs + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking that generated files are newer than configure" >&5 +$as_echo_n "checking that generated files are newer than configure... " >&6; } + if test -n "$am_sleep_pid"; then + # Hide warnings about reused PIDs. 
+ wait $am_sleep_pid 2>/dev/null + fi + { $as_echo "$as_me:${as_lineno-$LINENO}: result: done" >&5 +$as_echo "done" >&6; } + if test -n "$EXEEXT"; then + am__EXEEXT_TRUE= + am__EXEEXT_FALSE='#' +else + am__EXEEXT_TRUE='#' + am__EXEEXT_FALSE= +fi + +if test -z "${MAINTAINER_MODE_TRUE}" && test -z "${MAINTAINER_MODE_FALSE}"; then + as_fn_error $? "conditional \"MAINTAINER_MODE\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi +if test -z "${AMDEP_TRUE}" && test -z "${AMDEP_FALSE}"; then + as_fn_error $? "conditional \"AMDEP\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi +if test -z "${am__fastdepCC_TRUE}" && test -z "${am__fastdepCC_FALSE}"; then + as_fn_error $? "conditional \"am__fastdepCC\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi +if test -z "${am__fastdepCCAS_TRUE}" && test -z "${am__fastdepCCAS_FALSE}"; then + as_fn_error $? "conditional \"am__fastdepCCAS\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi +if test -z "${am__fastdepCXX_TRUE}" && test -z "${am__fastdepCXX_FALSE}"; then + as_fn_error $? "conditional \"am__fastdepCXX\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi +if test -z "${WANT_JANSSON_TRUE}" && test -z "${WANT_JANSSON_FALSE}"; then + as_fn_error $? "conditional \"WANT_JANSSON\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi +if test -z "${HAVE_WINDOWS_TRUE}" && test -z "${HAVE_WINDOWS_FALSE}"; then + as_fn_error $? "conditional \"HAVE_WINDOWS\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi +if test -z "${ARCH_x86_TRUE}" && test -z "${ARCH_x86_FALSE}"; then + as_fn_error $? "conditional \"ARCH_x86\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi +if test -z "${ARCH_x86_64_TRUE}" && test -z "${ARCH_x86_64_FALSE}"; then + as_fn_error $? "conditional \"ARCH_x86_64\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi + +: "${CONFIG_STATUS=./config.status}" +ac_write_fail=0 +ac_clean_files_save=$ac_clean_files +ac_clean_files="$ac_clean_files $CONFIG_STATUS" +{ $as_echo "$as_me:${as_lineno-$LINENO}: creating $CONFIG_STATUS" >&5 +$as_echo "$as_me: creating $CONFIG_STATUS" >&6;} +as_write_fail=0 +cat >$CONFIG_STATUS <<_ASEOF || as_write_fail=1 +#! $SHELL +# Generated by $as_me. +# Run this file to recreate the current configuration. +# Compiler output produced by configure, useful for debugging +# configure, is in config.log if it exists. + +debug=false +ac_cs_recheck=false +ac_cs_silent=false + +SHELL=\${CONFIG_SHELL-$SHELL} +export SHELL +_ASEOF +cat >>$CONFIG_STATUS <<\_ASEOF || as_write_fail=1 +## -------------------- ## +## M4sh Initialization. ## +## -------------------- ## + +# Be more Bourne compatible +DUALCASE=1; export DUALCASE # for MKS sh +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then : + emulate sh + NULLCMD=: + # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which + # is contrary to our usage. Disable this feature. + alias -g '${1+"$@"}'='"$@"' + setopt NO_GLOB_SUBST +else + case `(set -o) 2>/dev/null` in #( + *posix*) : + set -o posix ;; #( + *) : + ;; +esac +fi + + +as_nl=' +' +export as_nl +# Printing a long string crashes Solaris 7 /usr/bin/printf. 
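+# The long backslash string assigned below is only a probe value: each
+# candidate echo replacement (ksh print, printf, /usr/ucb/echo, or an
+# expr-based fallback) is tested against it before being adopted as $as_echo.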
+as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\' +as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo +as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo +# Prefer a ksh shell builtin over an external printf program on Solaris, +# but without wasting forks for bash or zsh. +if test -z "$BASH_VERSION$ZSH_VERSION" \ + && (test "X`print -r -- $as_echo`" = "X$as_echo") 2>/dev/null; then + as_echo='print -r --' + as_echo_n='print -rn --' +elif (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then + as_echo='printf %s\n' + as_echo_n='printf %s' +else + if test "X`(/usr/ucb/echo -n -n $as_echo) 2>/dev/null`" = "X-n $as_echo"; then + as_echo_body='eval /usr/ucb/echo -n "$1$as_nl"' + as_echo_n='/usr/ucb/echo -n' + else + as_echo_body='eval expr "X$1" : "X\\(.*\\)"' + as_echo_n_body='eval + arg=$1; + case $arg in #( + *"$as_nl"*) + expr "X$arg" : "X\\(.*\\)$as_nl"; + arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;; + esac; + expr "X$arg" : "X\\(.*\\)" | tr -d "$as_nl" + ' + export as_echo_n_body + as_echo_n='sh -c $as_echo_n_body as_echo' + fi + export as_echo_body + as_echo='sh -c $as_echo_body as_echo' +fi + +# The user is always right. +if test "${PATH_SEPARATOR+set}" != set; then + PATH_SEPARATOR=: + (PATH='/bin;/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 && { + (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 || + PATH_SEPARATOR=';' + } +fi + + +# IFS +# We need space, tab and new line, in precisely that order. Quoting is +# there to prevent editors from complaining about space-tab. +# (If _AS_PATH_WALK were called with IFS unset, it would disable word +# splitting by setting IFS to empty value.) +IFS=" "" $as_nl" + +# Find who we are. Look in the path if we contain no directory separator. +as_myself= +case $0 in #(( + *[\\/]* ) as_myself=$0 ;; + *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break + done +IFS=$as_save_IFS + + ;; +esac +# We did not find ourselves, most probably we were run as `sh COMMAND' +# in which case we are not to be found in the path. +if test "x$as_myself" = x; then + as_myself=$0 +fi +if test ! -f "$as_myself"; then + $as_echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2 + exit 1 +fi + +# Unset variables that we do not need and which cause bugs (e.g. in +# pre-3.0 UWIN ksh). But do not cause bugs in bash 2.01; the "|| exit 1" +# suppresses any "Segmentation fault" message there. '((' could +# trigger a bug in pdksh 5.2.14. +for as_var in BASH_ENV ENV MAIL MAILPATH +do eval test x\${$as_var+set} = xset \ + && ( (unset $as_var) || exit 1) >/dev/null 2>&1 && unset $as_var || : +done +PS1='$ ' +PS2='> ' +PS4='+ ' + +# NLS nuisances. +LC_ALL=C +export LC_ALL +LANGUAGE=C +export LANGUAGE + +# CDPATH. +(unset CDPATH) >/dev/null 2>&1 && unset CDPATH + + +# as_fn_error STATUS ERROR [LINENO LOG_FD] +# ---------------------------------------- +# Output "`basename $0`: error: ERROR" to stderr. If LINENO and LOG_FD are +# provided, also output the error to LOG_FD, referencing LINENO. Then exit the +# script with STATUS, using 1 if that was 0. 
+as_fn_error () +{ + as_status=$1; test $as_status -eq 0 && as_status=1 + if test "$4"; then + as_lineno=${as_lineno-"$3"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + $as_echo "$as_me:${as_lineno-$LINENO}: error: $2" >&$4 + fi + $as_echo "$as_me: error: $2" >&2 + as_fn_exit $as_status +} # as_fn_error + + +# as_fn_set_status STATUS +# ----------------------- +# Set $? to STATUS, without forking. +as_fn_set_status () +{ + return $1 +} # as_fn_set_status + +# as_fn_exit STATUS +# ----------------- +# Exit the shell with STATUS, even in a "trap 0" or "set -e" context. +as_fn_exit () +{ + set +e + as_fn_set_status $1 + exit $1 +} # as_fn_exit + +# as_fn_unset VAR +# --------------- +# Portably unset VAR. +as_fn_unset () +{ + { eval $1=; unset $1;} +} +as_unset=as_fn_unset +# as_fn_append VAR VALUE +# ---------------------- +# Append the text in VALUE to the end of the definition contained in VAR. Take +# advantage of any shell optimizations that allow amortized linear growth over +# repeated appends, instead of the typical quadratic growth present in naive +# implementations. +if (eval "as_var=1; as_var+=2; test x\$as_var = x12") 2>/dev/null; then : + eval 'as_fn_append () + { + eval $1+=\$2 + }' +else + as_fn_append () + { + eval $1=\$$1\$2 + } +fi # as_fn_append + +# as_fn_arith ARG... +# ------------------ +# Perform arithmetic evaluation on the ARGs, and store the result in the +# global $as_val. Take advantage of shells that can avoid forks. The arguments +# must be portable across $(()) and expr. +if (eval "test \$(( 1 + 1 )) = 2") 2>/dev/null; then : + eval 'as_fn_arith () + { + as_val=$(( $* )) + }' +else + as_fn_arith () + { + as_val=`expr "$@" || test $? -eq 1` + } +fi # as_fn_arith + + +if expr a : '\(a\)' >/dev/null 2>&1 && + test "X`expr 00001 : '.*\(...\)'`" = X001; then + as_expr=expr +else + as_expr=false +fi + +if (basename -- /) >/dev/null 2>&1 && test "X`basename -- / 2>&1`" = "X/"; then + as_basename=basename +else + as_basename=false +fi + +if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then + as_dirname=dirname +else + as_dirname=false +fi + +as_me=`$as_basename -- "$0" || +$as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \ + X"$0" : 'X\(//\)$' \| \ + X"$0" : 'X\(/\)' \| . 2>/dev/null || +$as_echo X/"$0" | + sed '/^.*\/\([^/][^/]*\)\/*$/{ + s//\1/ + q + } + /^X\/\(\/\/\)$/{ + s//\1/ + q + } + /^X\/\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + +# Avoid depending upon Character Ranges. +as_cr_letters='abcdefghijklmnopqrstuvwxyz' +as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ' +as_cr_Letters=$as_cr_letters$as_cr_LETTERS +as_cr_digits='0123456789' +as_cr_alnum=$as_cr_Letters$as_cr_digits + +ECHO_C= ECHO_N= ECHO_T= +case `echo -n x` in #((((( +-n*) + case `echo 'xy\c'` in + *c*) ECHO_T=' ';; # ECHO_T is single tab character. + xy) ECHO_C='\c';; + *) echo `echo ksh88 bug on AIX 6.1` > /dev/null + ECHO_T=' ';; + esac;; +*) + ECHO_N='-n';; +esac + +rm -f conf$$ conf$$.exe conf$$.file +if test -d conf$$.dir; then + rm -f conf$$.dir/conf$$.file +else + rm -f conf$$.dir + mkdir conf$$.dir 2>/dev/null +fi +if (echo >conf$$.file) 2>/dev/null; then + if ln -s conf$$.file conf$$ 2>/dev/null; then + as_ln_s='ln -s' + # ... but there are two gotchas: + # 1) On MSYS, both `ln -s file dir' and `ln file dir' fail. + # 2) DJGPP < 2.04 has no symlinks; `ln -s' creates a wrapper executable. + # In both cases, we have to default to `cp -pR'. + ln -s conf$$.file conf$$.dir 2>/dev/null && test ! 
-f conf$$.exe || + as_ln_s='cp -pR' + elif ln conf$$.file conf$$ 2>/dev/null; then + as_ln_s=ln + else + as_ln_s='cp -pR' + fi +else + as_ln_s='cp -pR' +fi +rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file +rmdir conf$$.dir 2>/dev/null + + +# as_fn_mkdir_p +# ------------- +# Create "$as_dir" as a directory, including parents if necessary. +as_fn_mkdir_p () +{ + + case $as_dir in #( + -*) as_dir=./$as_dir;; + esac + test -d "$as_dir" || eval $as_mkdir_p || { + as_dirs= + while :; do + case $as_dir in #( + *\'*) as_qdir=`$as_echo "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'( + *) as_qdir=$as_dir;; + esac + as_dirs="'$as_qdir' $as_dirs" + as_dir=`$as_dirname -- "$as_dir" || +$as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ + X"$as_dir" : 'X\(//\)[^/]' \| \ + X"$as_dir" : 'X\(//\)$' \| \ + X"$as_dir" : 'X\(/\)' \| . 2>/dev/null || +$as_echo X"$as_dir" | + sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ + s//\1/ + q + } + /^X\(\/\/\)[^/].*/{ + s//\1/ + q + } + /^X\(\/\/\)$/{ + s//\1/ + q + } + /^X\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + test -d "$as_dir" && break + done + test -z "$as_dirs" || eval "mkdir $as_dirs" + } || test -d "$as_dir" || as_fn_error $? "cannot create directory $as_dir" + + +} # as_fn_mkdir_p +if mkdir -p . 2>/dev/null; then + as_mkdir_p='mkdir -p "$as_dir"' +else + test -d ./-p && rmdir ./-p + as_mkdir_p=false +fi + + +# as_fn_executable_p FILE +# ----------------------- +# Test if FILE is an executable regular file. +as_fn_executable_p () +{ + test -f "$1" && test -x "$1" +} # as_fn_executable_p +as_test_x='test -x' +as_executable_p=as_fn_executable_p + +# Sed expression to map a string onto a valid CPP name. +as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'" + +# Sed expression to map a string onto a valid variable name. +as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'" + + +exec 6>&1 +## ----------------------------------- ## +## Main body of $CONFIG_STATUS script. ## +## ----------------------------------- ## +_ASEOF +test $as_write_fail = 0 && chmod +x $CONFIG_STATUS || ac_write_fail=1 + +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +# Save the log message, to keep $0 and so on meaningful, and to +# report actual input values of CONFIG_FILES etc. instead of their +# values after options handling. +ac_log=" +This file was extended by ccminer $as_me 2014.03.18, which was +generated by GNU Autoconf 2.69. Invocation command line was + + CONFIG_FILES = $CONFIG_FILES + CONFIG_HEADERS = $CONFIG_HEADERS + CONFIG_LINKS = $CONFIG_LINKS + CONFIG_COMMANDS = $CONFIG_COMMANDS + $ $0 $@ + +on `(hostname || uname -n) 2>/dev/null | sed 1q` +" + +_ACEOF + +case $ac_config_files in *" +"*) set x $ac_config_files; shift; ac_config_files=$*;; +esac + +case $ac_config_headers in *" +"*) set x $ac_config_headers; shift; ac_config_headers=$*;; +esac + + +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +# Files that config.status was made for. +config_files="$ac_config_files" +config_headers="$ac_config_headers" +config_commands="$ac_config_commands" + +_ACEOF + +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +ac_cs_usage="\ +\`$as_me' instantiates files and other configuration actions +from templates according to the current configuration. Unless the files +and actions are specified as TAGs, all are instantiated by default. + +Usage: $0 [OPTION]... [TAG]... 
+ + -h, --help print this help, then exit + -V, --version print version number and configuration settings, then exit + --config print configuration, then exit + -q, --quiet, --silent + do not print progress messages + -d, --debug don't remove temporary files + --recheck update $as_me by reconfiguring in the same conditions + --file=FILE[:TEMPLATE] + instantiate the configuration file FILE + --header=FILE[:TEMPLATE] + instantiate the configuration header FILE + +Configuration files: +$config_files + +Configuration headers: +$config_headers + +Configuration commands: +$config_commands + +Report bugs to the package provider." + +_ACEOF +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" +ac_cs_version="\\ +ccminer config.status 2014.03.18 +configured by $0, generated by GNU Autoconf 2.69, + with options \\"\$ac_cs_config\\" + +Copyright (C) 2012 Free Software Foundation, Inc. +This config.status script is free software; the Free Software Foundation +gives unlimited permission to copy, distribute and modify it." + +ac_pwd='$ac_pwd' +srcdir='$srcdir' +INSTALL='$INSTALL' +MKDIR_P='$MKDIR_P' +AWK='$AWK' +test -n "\$AWK" || AWK=awk +_ACEOF + +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +# The default lists apply if the user does not specify any file. +ac_need_defaults=: +while test $# != 0 +do + case $1 in + --*=?*) + ac_option=`expr "X$1" : 'X\([^=]*\)='` + ac_optarg=`expr "X$1" : 'X[^=]*=\(.*\)'` + ac_shift=: + ;; + --*=) + ac_option=`expr "X$1" : 'X\([^=]*\)='` + ac_optarg= + ac_shift=: + ;; + *) + ac_option=$1 + ac_optarg=$2 + ac_shift=shift + ;; + esac + + case $ac_option in + # Handling of the options. + -recheck | --recheck | --rechec | --reche | --rech | --rec | --re | --r) + ac_cs_recheck=: ;; + --version | --versio | --versi | --vers | --ver | --ve | --v | -V ) + $as_echo "$ac_cs_version"; exit ;; + --config | --confi | --conf | --con | --co | --c ) + $as_echo "$ac_cs_config"; exit ;; + --debug | --debu | --deb | --de | --d | -d ) + debug=: ;; + --file | --fil | --fi | --f ) + $ac_shift + case $ac_optarg in + *\'*) ac_optarg=`$as_echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"` ;; + '') as_fn_error $? "missing file argument" ;; + esac + as_fn_append CONFIG_FILES " '$ac_optarg'" + ac_need_defaults=false;; + --header | --heade | --head | --hea ) + $ac_shift + case $ac_optarg in + *\'*) ac_optarg=`$as_echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"` ;; + esac + as_fn_append CONFIG_HEADERS " '$ac_optarg'" + ac_need_defaults=false;; + --he | --h) + # Conflict between --help and --header + as_fn_error $? "ambiguous option: \`$1' +Try \`$0 --help' for more information.";; + --help | --hel | -h ) + $as_echo "$ac_cs_usage"; exit ;; + -q | -quiet | --quiet | --quie | --qui | --qu | --q \ + | -silent | --silent | --silen | --sile | --sil | --si | --s) + ac_cs_silent=: ;; + + # This is an error. + -*) as_fn_error $? "unrecognized option: \`$1' +Try \`$0 --help' for more information." 
;; + + *) as_fn_append ac_config_targets " $1" + ac_need_defaults=false ;; + + esac + shift +done + +ac_configure_extra_args= + +if $ac_cs_silent; then + exec 6>/dev/null + ac_configure_extra_args="$ac_configure_extra_args --silent" +fi + +_ACEOF +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +if \$ac_cs_recheck; then + set X $SHELL '$0' $ac_configure_args \$ac_configure_extra_args --no-create --no-recursion + shift + \$as_echo "running CONFIG_SHELL=$SHELL \$*" >&6 + CONFIG_SHELL='$SHELL' + export CONFIG_SHELL + exec "\$@" +fi + +_ACEOF +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +exec 5>>config.log +{ + echo + sed 'h;s/./-/g;s/^.../## /;s/...$/ ##/;p;x;p;x' <<_ASBOX +## Running $as_me. ## +_ASBOX + $as_echo "$ac_log" +} >&5 + +_ACEOF +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +# +# INIT-COMMANDS +# +AMDEP_TRUE="$AMDEP_TRUE" ac_aux_dir="$ac_aux_dir" + +_ACEOF + +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 + +# Handling of arguments. +for ac_config_target in $ac_config_targets +do + case $ac_config_target in + "cpuminer-config.h") CONFIG_HEADERS="$CONFIG_HEADERS cpuminer-config.h" ;; + "depfiles") CONFIG_COMMANDS="$CONFIG_COMMANDS depfiles" ;; + "Makefile") CONFIG_FILES="$CONFIG_FILES Makefile" ;; + "compat/Makefile") CONFIG_FILES="$CONFIG_FILES compat/Makefile" ;; + "compat/jansson/Makefile") CONFIG_FILES="$CONFIG_FILES compat/jansson/Makefile" ;; + + *) as_fn_error $? "invalid argument: \`$ac_config_target'" "$LINENO" 5;; + esac +done + + +# If the user did not use the arguments to specify the items to instantiate, +# then the envvar interface is used. Set only those that are not. +# We use the long form for the default assignment because of an extremely +# bizarre bug on SunOS 4.1.3. +if $ac_need_defaults; then + test "${CONFIG_FILES+set}" = set || CONFIG_FILES=$config_files + test "${CONFIG_HEADERS+set}" = set || CONFIG_HEADERS=$config_headers + test "${CONFIG_COMMANDS+set}" = set || CONFIG_COMMANDS=$config_commands +fi + +# Have a temporary directory for convenience. Make it in the build tree +# simply because there is no reason against having it here, and in addition, +# creating and moving files from /tmp can sometimes cause problems. +# Hook for its removal unless debugging. +# Note that there is a small window in which the directory will not be cleaned: +# after its creation but before its name has been assigned to `$tmp'. +$debug || +{ + tmp= ac_tmp= + trap 'exit_status=$? + : "${ac_tmp:=$tmp}" + { test ! -d "$ac_tmp" || rm -fr "$ac_tmp"; } && exit $exit_status +' 0 + trap 'as_fn_exit 1' 1 2 13 15 +} +# Create a (secure) tmp directory for tmp files. + +{ + tmp=`(umask 077 && mktemp -d "./confXXXXXX") 2>/dev/null` && + test -d "$tmp" +} || +{ + tmp=./conf$$-$RANDOM + (umask 077 && mkdir "$tmp") +} || as_fn_error $? "cannot create a temporary directory in ." "$LINENO" 5 +ac_tmp=$tmp + +# Set up the scripts for CONFIG_FILES section. +# No need to generate them if there are no CONFIG_FILES. +# This happens for instance with `./config.status config.h'. +if test -n "$CONFIG_FILES"; then + + +ac_cr=`echo X | tr X '\015'` +# On cygwin, bash can eat \r inside `` if the user requested igncr. +# But we know of no other shell where ac_cr would be empty at this +# point, so we can use a bashism as a fallback. 
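+# The test below determines whether $AWK prints "\r" as a literal carriage
+# return; the result controls how carriage returns are re-encoded in the
+# substitution script.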
+if test "x$ac_cr" = x; then + eval ac_cr=\$\'\\r\' +fi +ac_cs_awk_cr=`$AWK 'BEGIN { print "a\rb" }' /dev/null` +if test "$ac_cs_awk_cr" = "a${ac_cr}b"; then + ac_cs_awk_cr='\\r' +else + ac_cs_awk_cr=$ac_cr +fi + +echo 'BEGIN {' >"$ac_tmp/subs1.awk" && +_ACEOF + + +{ + echo "cat >conf$$subs.awk <<_ACEOF" && + echo "$ac_subst_vars" | sed 's/.*/&!$&$ac_delim/' && + echo "_ACEOF" +} >conf$$subs.sh || + as_fn_error $? "could not make $CONFIG_STATUS" "$LINENO" 5 +ac_delim_num=`echo "$ac_subst_vars" | grep -c '^'` +ac_delim='%!_!# ' +for ac_last_try in false false false false false :; do + . ./conf$$subs.sh || + as_fn_error $? "could not make $CONFIG_STATUS" "$LINENO" 5 + + ac_delim_n=`sed -n "s/.*$ac_delim\$/X/p" conf$$subs.awk | grep -c X` + if test $ac_delim_n = $ac_delim_num; then + break + elif $ac_last_try; then + as_fn_error $? "could not make $CONFIG_STATUS" "$LINENO" 5 + else + ac_delim="$ac_delim!$ac_delim _$ac_delim!! " + fi +done +rm -f conf$$subs.sh + +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +cat >>"\$ac_tmp/subs1.awk" <<\\_ACAWK && +_ACEOF +sed -n ' +h +s/^/S["/; s/!.*/"]=/ +p +g +s/^[^!]*!// +:repl +t repl +s/'"$ac_delim"'$// +t delim +:nl +h +s/\(.\{148\}\)..*/\1/ +t more1 +s/["\\]/\\&/g; s/^/"/; s/$/\\n"\\/ +p +n +b repl +:more1 +s/["\\]/\\&/g; s/^/"/; s/$/"\\/ +p +g +s/.\{148\}// +t nl +:delim +h +s/\(.\{148\}\)..*/\1/ +t more2 +s/["\\]/\\&/g; s/^/"/; s/$/"/ +p +b +:more2 +s/["\\]/\\&/g; s/^/"/; s/$/"\\/ +p +g +s/.\{148\}// +t delim +' >$CONFIG_STATUS || ac_write_fail=1 +rm -f conf$$subs.awk +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +_ACAWK +cat >>"\$ac_tmp/subs1.awk" <<_ACAWK && + for (key in S) S_is_set[key] = 1 + FS = "" + +} +{ + line = $ 0 + nfields = split(line, field, "@") + substed = 0 + len = length(field[1]) + for (i = 2; i < nfields; i++) { + key = field[i] + keylen = length(key) + if (S_is_set[key]) { + value = S[key] + line = substr(line, 1, len) "" value "" substr(line, len + keylen + 3) + len += length(value) + length(field[++i]) + substed = 1 + } else + len += 1 + keylen + } + + print line +} + +_ACAWK +_ACEOF +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +if sed "s/$ac_cr//" < /dev/null > /dev/null 2>&1; then + sed "s/$ac_cr\$//; s/$ac_cr/$ac_cs_awk_cr/g" +else + cat +fi < "$ac_tmp/subs1.awk" > "$ac_tmp/subs.awk" \ + || as_fn_error $? "could not setup config files machinery" "$LINENO" 5 +_ACEOF + +# VPATH may cause trouble with some makes, so we remove sole $(srcdir), +# ${srcdir} and @srcdir@ entries from VPATH if srcdir is ".", strip leading and +# trailing colons and then remove the whole line if VPATH becomes empty +# (actually we leave an empty line to preserve line numbers). +if test "x$srcdir" = x.; then + ac_vpsub='/^[ ]*VPATH[ ]*=[ ]*/{ +h +s/// +s/^/:/ +s/[ ]*$/:/ +s/:\$(srcdir):/:/g +s/:\${srcdir}:/:/g +s/:@srcdir@:/:/g +s/^:*// +s/:*$// +x +s/\(=[ ]*\).*/\1/ +G +s/\n// +s/^[^=]*=[ ]*$// +}' +fi + +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +fi # test -n "$CONFIG_FILES" + +# Set up the scripts for CONFIG_HEADERS section. +# No need to generate them if there are no CONFIG_HEADERS. +# This happens for instance with `./config.status Makefile'. +if test -n "$CONFIG_HEADERS"; then +cat >"$ac_tmp/defines.awk" <<\_ACAWK || +BEGIN { +_ACEOF + +# Transform confdefs.h into an awk script `defines.awk', embedded as +# here-document in config.status, that substitutes the proper values into +# config.h.in to produce config.h. + +# Create a delimiter string that does not exist in confdefs.h, to ease +# handling of long lines. 
+ac_delim='%!_!# ' +for ac_last_try in false false :; do + ac_tt=`sed -n "/$ac_delim/p" confdefs.h` + if test -z "$ac_tt"; then + break + elif $ac_last_try; then + as_fn_error $? "could not make $CONFIG_HEADERS" "$LINENO" 5 + else + ac_delim="$ac_delim!$ac_delim _$ac_delim!! " + fi +done + +# For the awk script, D is an array of macro values keyed by name, +# likewise P contains macro parameters if any. Preserve backslash +# newline sequences. + +ac_word_re=[_$as_cr_Letters][_$as_cr_alnum]* +sed -n ' +s/.\{148\}/&'"$ac_delim"'/g +t rset +:rset +s/^[ ]*#[ ]*define[ ][ ]*/ / +t def +d +:def +s/\\$// +t bsnl +s/["\\]/\\&/g +s/^ \('"$ac_word_re"'\)\(([^()]*)\)[ ]*\(.*\)/P["\1"]="\2"\ +D["\1"]=" \3"/p +s/^ \('"$ac_word_re"'\)[ ]*\(.*\)/D["\1"]=" \2"/p +d +:bsnl +s/["\\]/\\&/g +s/^ \('"$ac_word_re"'\)\(([^()]*)\)[ ]*\(.*\)/P["\1"]="\2"\ +D["\1"]=" \3\\\\\\n"\\/p +t cont +s/^ \('"$ac_word_re"'\)[ ]*\(.*\)/D["\1"]=" \2\\\\\\n"\\/p +t cont +d +:cont +n +s/.\{148\}/&'"$ac_delim"'/g +t clear +:clear +s/\\$// +t bsnlc +s/["\\]/\\&/g; s/^/"/; s/$/"/p +d +:bsnlc +s/["\\]/\\&/g; s/^/"/; s/$/\\\\\\n"\\/p +b cont +' >$CONFIG_STATUS || ac_write_fail=1 + +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 + for (key in D) D_is_set[key] = 1 + FS = "" +} +/^[\t ]*#[\t ]*(define|undef)[\t ]+$ac_word_re([\t (]|\$)/ { + line = \$ 0 + split(line, arg, " ") + if (arg[1] == "#") { + defundef = arg[2] + mac1 = arg[3] + } else { + defundef = substr(arg[1], 2) + mac1 = arg[2] + } + split(mac1, mac2, "(") #) + macro = mac2[1] + prefix = substr(line, 1, index(line, defundef) - 1) + if (D_is_set[macro]) { + # Preserve the white space surrounding the "#". + print prefix "define", macro P[macro] D[macro] + next + } else { + # Replace #undef with comments. This is necessary, for example, + # in the case of _POSIX_SOURCE, which is predefined and required + # on some systems where configure will not decide to define it. + if (defundef == "undef") { + print "/*", prefix defundef, macro, "*/" + next + } + } +} +{ print } +_ACAWK +_ACEOF +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 + as_fn_error $? "could not setup config headers machinery" "$LINENO" 5 +fi # test -n "$CONFIG_HEADERS" + + +eval set X " :F $CONFIG_FILES :H $CONFIG_HEADERS :C $CONFIG_COMMANDS" +shift +for ac_tag +do + case $ac_tag in + :[FHLC]) ac_mode=$ac_tag; continue;; + esac + case $ac_mode$ac_tag in + :[FHL]*:*);; + :L* | :C*:*) as_fn_error $? "invalid tag \`$ac_tag'" "$LINENO" 5;; + :[FH]-) ac_tag=-:-;; + :[FH]*) ac_tag=$ac_tag:$ac_tag.in;; + esac + ac_save_IFS=$IFS + IFS=: + set x $ac_tag + IFS=$ac_save_IFS + shift + ac_file=$1 + shift + + case $ac_mode in + :L) ac_source=$1;; + :[FH]) + ac_file_inputs= + for ac_f + do + case $ac_f in + -) ac_f="$ac_tmp/stdin";; + *) # Look for the file first in the build tree, then in the source tree + # (if the path is not absolute). The absolute path cannot be DOS-style, + # because $ac_f cannot contain `:'. + test -f "$ac_f" || + case $ac_f in + [\\/$]*) false;; + *) test -f "$srcdir/$ac_f" && ac_f="$srcdir/$ac_f";; + esac || + as_fn_error 1 "cannot find input file: \`$ac_f'" "$LINENO" 5;; + esac + case $ac_f in *\'*) ac_f=`$as_echo "$ac_f" | sed "s/'/'\\\\\\\\''/g"`;; esac + as_fn_append ac_file_inputs " '$ac_f'" + done + + # Let's still pretend it is `configure' which instantiates (i.e., don't + # use $as_me), people would be surprised to read: + # /* config.h. Generated by config.status. */ + configure_input='Generated from '` + $as_echo "$*" | sed 's|^[^:]*/||;s|:[^:]*/|, |g' + `' by configure.' 
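+  # This provenance string later replaces @configure_input@ in the
+  # generated output file.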
+ if test x"$ac_file" != x-; then + configure_input="$ac_file. $configure_input" + { $as_echo "$as_me:${as_lineno-$LINENO}: creating $ac_file" >&5 +$as_echo "$as_me: creating $ac_file" >&6;} + fi + # Neutralize special characters interpreted by sed in replacement strings. + case $configure_input in #( + *\&* | *\|* | *\\* ) + ac_sed_conf_input=`$as_echo "$configure_input" | + sed 's/[\\\\&|]/\\\\&/g'`;; #( + *) ac_sed_conf_input=$configure_input;; + esac + + case $ac_tag in + *:-:* | *:-) cat >"$ac_tmp/stdin" \ + || as_fn_error $? "could not create $ac_file" "$LINENO" 5 ;; + esac + ;; + esac + + ac_dir=`$as_dirname -- "$ac_file" || +$as_expr X"$ac_file" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ + X"$ac_file" : 'X\(//\)[^/]' \| \ + X"$ac_file" : 'X\(//\)$' \| \ + X"$ac_file" : 'X\(/\)' \| . 2>/dev/null || +$as_echo X"$ac_file" | + sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ + s//\1/ + q + } + /^X\(\/\/\)[^/].*/{ + s//\1/ + q + } + /^X\(\/\/\)$/{ + s//\1/ + q + } + /^X\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + as_dir="$ac_dir"; as_fn_mkdir_p + ac_builddir=. + +case "$ac_dir" in +.) ac_dir_suffix= ac_top_builddir_sub=. ac_top_build_prefix= ;; +*) + ac_dir_suffix=/`$as_echo "$ac_dir" | sed 's|^\.[\\/]||'` + # A ".." for each directory in $ac_dir_suffix. + ac_top_builddir_sub=`$as_echo "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'` + case $ac_top_builddir_sub in + "") ac_top_builddir_sub=. ac_top_build_prefix= ;; + *) ac_top_build_prefix=$ac_top_builddir_sub/ ;; + esac ;; +esac +ac_abs_top_builddir=$ac_pwd +ac_abs_builddir=$ac_pwd$ac_dir_suffix +# for backward compatibility: +ac_top_builddir=$ac_top_build_prefix + +case $srcdir in + .) # We are building in place. + ac_srcdir=. + ac_top_srcdir=$ac_top_builddir_sub + ac_abs_top_srcdir=$ac_pwd ;; + [\\/]* | ?:[\\/]* ) # Absolute name. + ac_srcdir=$srcdir$ac_dir_suffix; + ac_top_srcdir=$srcdir + ac_abs_top_srcdir=$srcdir ;; + *) # Relative name. + ac_srcdir=$ac_top_build_prefix$srcdir$ac_dir_suffix + ac_top_srcdir=$ac_top_build_prefix$srcdir + ac_abs_top_srcdir=$ac_pwd/$srcdir ;; +esac +ac_abs_srcdir=$ac_abs_top_srcdir$ac_dir_suffix + + + case $ac_mode in + :F) + # + # CONFIG_FILE + # + + case $INSTALL in + [\\/$]* | ?:[\\/]* ) ac_INSTALL=$INSTALL ;; + *) ac_INSTALL=$ac_top_build_prefix$INSTALL ;; + esac + ac_MKDIR_P=$MKDIR_P + case $MKDIR_P in + [\\/$]* | ?:[\\/]* ) ;; + */*) ac_MKDIR_P=$ac_top_build_prefix$MKDIR_P ;; + esac +_ACEOF + +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +# If the template does not know about datarootdir, expand it. +# FIXME: This hack should be removed a few years after 2.60. +ac_datarootdir_hack=; ac_datarootdir_seen= +ac_sed_dataroot=' +/datarootdir/ { + p + q +} +/@datadir@/p +/@docdir@/p +/@infodir@/p +/@localedir@/p +/@mandir@/p' +case `eval "sed -n \"\$ac_sed_dataroot\" $ac_file_inputs"` in +*datarootdir*) ac_datarootdir_seen=yes;; +*@datadir@*|*@docdir@*|*@infodir@*|*@localedir@*|*@mandir@*) + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&5 +$as_echo "$as_me: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&2;} +_ACEOF +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 + ac_datarootdir_hack=' + s&@datadir@&$datadir&g + s&@docdir@&$docdir&g + s&@infodir@&$infodir&g + s&@localedir@&$localedir&g + s&@mandir@&$mandir&g + s&\\\${datarootdir}&$datarootdir&g' ;; +esac +_ACEOF + +# Neutralize VPATH when `$srcdir' = `.'. +# Shell code in configure.ac might set extrasub. +# FIXME: do we really want to maintain this feature? 
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +ac_sed_extra="$ac_vpsub +$extrasub +_ACEOF +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +:t +/@[a-zA-Z_][a-zA-Z_0-9]*@/!b +s|@configure_input@|$ac_sed_conf_input|;t t +s&@top_builddir@&$ac_top_builddir_sub&;t t +s&@top_build_prefix@&$ac_top_build_prefix&;t t +s&@srcdir@&$ac_srcdir&;t t +s&@abs_srcdir@&$ac_abs_srcdir&;t t +s&@top_srcdir@&$ac_top_srcdir&;t t +s&@abs_top_srcdir@&$ac_abs_top_srcdir&;t t +s&@builddir@&$ac_builddir&;t t +s&@abs_builddir@&$ac_abs_builddir&;t t +s&@abs_top_builddir@&$ac_abs_top_builddir&;t t +s&@INSTALL@&$ac_INSTALL&;t t +s&@MKDIR_P@&$ac_MKDIR_P&;t t +$ac_datarootdir_hack +" +eval sed \"\$ac_sed_extra\" "$ac_file_inputs" | $AWK -f "$ac_tmp/subs.awk" \ + >$ac_tmp/out || as_fn_error $? "could not create $ac_file" "$LINENO" 5 + +test -z "$ac_datarootdir_hack$ac_datarootdir_seen" && + { ac_out=`sed -n '/\${datarootdir}/p' "$ac_tmp/out"`; test -n "$ac_out"; } && + { ac_out=`sed -n '/^[ ]*datarootdir[ ]*:*=/p' \ + "$ac_tmp/out"`; test -z "$ac_out"; } && + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $ac_file contains a reference to the variable \`datarootdir' +which seems to be undefined. Please make sure it is defined" >&5 +$as_echo "$as_me: WARNING: $ac_file contains a reference to the variable \`datarootdir' +which seems to be undefined. Please make sure it is defined" >&2;} + + rm -f "$ac_tmp/stdin" + case $ac_file in + -) cat "$ac_tmp/out" && rm -f "$ac_tmp/out";; + *) rm -f "$ac_file" && mv "$ac_tmp/out" "$ac_file";; + esac \ + || as_fn_error $? "could not create $ac_file" "$LINENO" 5 + ;; + :H) + # + # CONFIG_HEADER + # + if test x"$ac_file" != x-; then + { + $as_echo "/* $configure_input */" \ + && eval '$AWK -f "$ac_tmp/defines.awk"' "$ac_file_inputs" + } >"$ac_tmp/config.h" \ + || as_fn_error $? "could not create $ac_file" "$LINENO" 5 + if diff "$ac_file" "$ac_tmp/config.h" >/dev/null 2>&1; then + { $as_echo "$as_me:${as_lineno-$LINENO}: $ac_file is unchanged" >&5 +$as_echo "$as_me: $ac_file is unchanged" >&6;} + else + rm -f "$ac_file" + mv "$ac_tmp/config.h" "$ac_file" \ + || as_fn_error $? "could not create $ac_file" "$LINENO" 5 + fi + else + $as_echo "/* $configure_input */" \ + && eval '$AWK -f "$ac_tmp/defines.awk"' "$ac_file_inputs" \ + || as_fn_error $? "could not create -" "$LINENO" 5 + fi +# Compute "$ac_file"'s index in $config_headers. +_am_arg="$ac_file" +_am_stamp_count=1 +for _am_header in $config_headers :; do + case $_am_header in + $_am_arg | $_am_arg:* ) + break ;; + * ) + _am_stamp_count=`expr $_am_stamp_count + 1` ;; + esac +done +echo "timestamp for $_am_arg" >`$as_dirname -- "$_am_arg" || +$as_expr X"$_am_arg" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ + X"$_am_arg" : 'X\(//\)[^/]' \| \ + X"$_am_arg" : 'X\(//\)$' \| \ + X"$_am_arg" : 'X\(/\)' \| . 2>/dev/null || +$as_echo X"$_am_arg" | + sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ + s//\1/ + q + } + /^X\(\/\/\)[^/].*/{ + s//\1/ + q + } + /^X\(\/\/\)$/{ + s//\1/ + q + } + /^X\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'`/stamp-h$_am_stamp_count + ;; + + :C) { $as_echo "$as_me:${as_lineno-$LINENO}: executing $ac_file commands" >&5 +$as_echo "$as_me: executing $ac_file commands" >&6;} + ;; + esac + + + case $ac_file$ac_mode in + "depfiles":C) test x"$AMDEP_TRUE" != x"" || { + # Older Autoconf quotes --file arguments for eval, but not when files + # are listed without --file. Let's play safe and only enable the eval + # if we detect the quoting. 
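+  # For each Automake-generated Makefile in CONFIG_FILES, the loop below
+  # pre-creates empty '# dummy' dependency fragments under $(DEPDIR) so the
+  # first make run can include them without failing.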
+ case $CONFIG_FILES in + *\'*) eval set x "$CONFIG_FILES" ;; + *) set x $CONFIG_FILES ;; + esac + shift + for mf + do + # Strip MF so we end up with the name of the file. + mf=`echo "$mf" | sed -e 's/:.*$//'` + # Check whether this is an Automake generated Makefile or not. + # We used to match only the files named 'Makefile.in', but + # some people rename them; so instead we look at the file content. + # Grep'ing the first line is not enough: some people post-process + # each Makefile.in and add a new line on top of each file to say so. + # Grep'ing the whole file is not good either: AIX grep has a line + # limit of 2048, but all sed's we know have understand at least 4000. + if sed -n 's,^#.*generated by automake.*,X,p' "$mf" | grep X >/dev/null 2>&1; then + dirpart=`$as_dirname -- "$mf" || +$as_expr X"$mf" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ + X"$mf" : 'X\(//\)[^/]' \| \ + X"$mf" : 'X\(//\)$' \| \ + X"$mf" : 'X\(/\)' \| . 2>/dev/null || +$as_echo X"$mf" | + sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ + s//\1/ + q + } + /^X\(\/\/\)[^/].*/{ + s//\1/ + q + } + /^X\(\/\/\)$/{ + s//\1/ + q + } + /^X\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + else + continue + fi + # Extract the definition of DEPDIR, am__include, and am__quote + # from the Makefile without running 'make'. + DEPDIR=`sed -n 's/^DEPDIR = //p' < "$mf"` + test -z "$DEPDIR" && continue + am__include=`sed -n 's/^am__include = //p' < "$mf"` + test -z "$am__include" && continue + am__quote=`sed -n 's/^am__quote = //p' < "$mf"` + # Find all dependency output files, they are included files with + # $(DEPDIR) in their names. We invoke sed twice because it is the + # simplest approach to changing $(DEPDIR) to its actual value in the + # expansion. + for file in `sed -n " + s/^$am__include $am__quote\(.*(DEPDIR).*\)$am__quote"'$/\1/p' <"$mf" | \ + sed -e 's/\$(DEPDIR)/'"$DEPDIR"'/g'`; do + # Make sure the directory exists. + test -f "$dirpart/$file" && continue + fdir=`$as_dirname -- "$file" || +$as_expr X"$file" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ + X"$file" : 'X\(//\)[^/]' \| \ + X"$file" : 'X\(//\)$' \| \ + X"$file" : 'X\(/\)' \| . 2>/dev/null || +$as_echo X"$file" | + sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ + s//\1/ + q + } + /^X\(\/\/\)[^/].*/{ + s//\1/ + q + } + /^X\(\/\/\)$/{ + s//\1/ + q + } + /^X\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + as_dir=$dirpart/$fdir; as_fn_mkdir_p + # echo "creating $dirpart/$file" + echo '# dummy' > "$dirpart/$file" + done + done +} + ;; + + esac +done # for ac_tag + + +as_fn_exit 0 +_ACEOF +ac_clean_files=$ac_clean_files_save + +test $ac_write_fail = 0 || + as_fn_error $? "write failure creating $CONFIG_STATUS" "$LINENO" 5 + + +# configure is writing to config.log, and then calls config.status. +# config.status does its own redirection, appending to config.log. +# Unfortunately, on DOS this fails, as config.log is still kept open +# by configure, so config.status won't be able to write to it; its +# output is simply discarded. So we exec the FD to /dev/null, +# effectively closing config.log, so it can be properly (re)opened and +# appended to by config.status. When coming back to configure, we +# need to make the FD available again. +if test "$no_create" != yes; then + ac_cs_success=: + ac_config_status_args= + test "$silent" = yes && + ac_config_status_args="$ac_config_status_args --quiet" + exec 5>/dev/null + $SHELL $CONFIG_STATUS $ac_config_status_args || ac_cs_success=false + exec 5>>config.log + # Use ||, not &&, to avoid exiting from the if with $? 
 = 1, which
+  # would make configure fail if this is the last instruction.
+  $ac_cs_success || as_fn_exit 1
+fi
+if test -n "$ac_unrecognized_opts" && test "$enable_option_checking" != no; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: unrecognized options: $ac_unrecognized_opts" >&5
+$as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2;}
+fi
+
diff --git a/configure.ac b/configure.ac
new file mode 100644
index 0000000..52572fc
--- /dev/null
+++ b/configure.ac
@@ -0,0 +1,164 @@
+AC_INIT([ccminer], [2014.03.18])
+
+AC_PREREQ([2.59c])
+AC_CANONICAL_SYSTEM
+AC_CONFIG_SRCDIR([cpu-miner.c])
+AM_INIT_AUTOMAKE([gnu])
+AC_CONFIG_HEADERS([cpuminer-config.h])
+
+dnl Make sure anyone changing configure.ac/Makefile.am has a clue
+AM_MAINTAINER_MODE
+
+dnl Checks for programs
+AC_PROG_CC_C99
+AC_PROG_GCC_TRADITIONAL
+AM_PROG_CC_C_O
+AM_PROG_AS
+AC_PROG_RANLIB
+AC_PROG_CXX
+AC_OPENMP
+
+dnl Checks for header files
+AC_HEADER_STDC
+AC_CHECK_HEADERS([sys/endian.h sys/param.h syslog.h])
+# sys/sysctl.h requires sys/types.h on FreeBSD
+# sys/sysctl.h requires sys/param.h on OpenBSD
+AC_CHECK_HEADERS([sys/sysctl.h], [], [],
+[#include <sys/types.h>
+#ifdef HAVE_SYS_PARAM_H
+#include <sys/param.h>
+#endif
+])
+
+AC_CHECK_DECLS([be32dec, le32dec, be32enc, le32enc], [], [],
+[AC_INCLUDES_DEFAULT
+#ifdef HAVE_SYS_ENDIAN_H
+#include <sys/endian.h>
+#endif
+])
+
+AC_FUNC_ALLOCA
+AC_CHECK_FUNCS([getopt_long])
+
+case $target in
+  i*86-*-*)
+    have_x86=true
+    ;;
+  x86_64-*-*)
+    have_x86=true
+    have_x86_64=true
+    ;;
+  amd64-*-*)
+    have_x86=true
+    have_x86_64=true
+    ;;
+esac
+
+PTHREAD_FLAGS="-pthread"
+WS2_LIBS=""
+
+case $target in
+  *-*-mingw*)
+    have_win32=true
+    PTHREAD_FLAGS=""
+    WS2_LIBS="-lws2_32"
+    ;;
+esac
+
+if test x$have_x86 = xtrue
+then
+  AC_MSG_CHECKING(whether we can compile AVX code)
+  AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vmovdqa %ymm0, %ymm1");])],
+    AC_DEFINE(USE_AVX, 1, [Define to 1 if AVX assembly is available.])
+    AC_MSG_RESULT(yes)
+    AC_MSG_CHECKING(whether we can compile XOP code)
+    AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vprotd \$7, %xmm0, %xmm1");])],
+      AC_DEFINE(USE_XOP, 1, [Define to 1 if XOP assembly is available.])
+      AC_MSG_RESULT(yes)
+    ,
+      AC_MSG_RESULT(no)
+      AC_MSG_WARN([The assembler does not support the XOP instruction set.])
+    )
+    AC_MSG_CHECKING(whether we can compile AVX2 code)
+    AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vpaddd %ymm0, %ymm1, %ymm2");])],
+      AC_DEFINE(USE_AVX2, 1, [Define to 1 if AVX2 assembly is available.])
+      AC_MSG_RESULT(yes)
+    ,
+      AC_MSG_RESULT(no)
+      AC_MSG_WARN([The assembler does not support the AVX2 instruction set.])
+    )
+  ,
+    AC_MSG_RESULT(no)
+    AC_MSG_WARN([The assembler does not support the AVX instruction set.])
+  )
+fi
+
+AC_CHECK_LIB(jansson, json_loads, request_jansson=false, request_jansson=true)
+AC_CHECK_LIB([pthread], [pthread_create], PTHREAD_LIBS="-lpthread",
+  AC_CHECK_LIB([pthreadGC2], [pthread_create], PTHREAD_LIBS="-lpthreadGC2",
+    AC_CHECK_LIB([pthreadGC1], [pthread_create], PTHREAD_LIBS="-lpthreadGC1",
+      AC_CHECK_LIB([pthreadGC], [pthread_create], PTHREAD_LIBS="-lpthreadGC"
+))))
+
+AC_CHECK_LIB([ssl],[SSL_library_init], [], [AC_MSG_ERROR([OpenSSL library required])])
+AC_CHECK_LIB([crypto],[EVP_DigestFinal_ex], [], [AC_MSG_ERROR([OpenSSL library required])])
+
+AM_CONDITIONAL([WANT_JANSSON], [test x$request_jansson = xtrue])
+AM_CONDITIONAL([HAVE_WINDOWS], [test x$have_win32 = xtrue])
+AM_CONDITIONAL([ARCH_x86], [test x$have_x86 = xtrue])
+AM_CONDITIONAL([ARCH_x86_64], [test x$have_x86_64 = xtrue])
+
+if test x$request_jansson = xtrue
+then + JANSSON_LIBS="compat/jansson/libjansson.a" +else + JANSSON_LIBS=-ljansson +fi + +LIBCURL_CHECK_CONFIG(, 7.15.2, , + [AC_MSG_ERROR([Missing required libcurl >= 7.15.2])]) + +AC_SUBST(JANSSON_LIBS) +AC_SUBST(PTHREAD_FLAGS) +AC_SUBST(PTHREAD_LIBS) +AC_SUBST(WS2_LIBS) + +AC_CONFIG_FILES([ + Makefile + compat/Makefile + compat/jansson/Makefile + ]) + +dnl find out what version we are running +ARCH=`uname -m` +if [[ $ARCH == "x86_64" ]]; +then + SUFFIX="64" +else + SUFFIX="" +fi + +dnl Setup CUDA paths +AC_ARG_WITH([cuda], + [ --with-cuda=PATH prefix where cuda is installed [default=/usr/local/cuda]]) + +if test -n "$with_cuda" +then + CUDA_CFLAGS="-I$with_cuda/include" + CUDA_LIBS="-lcudart" + CUDA_LDFLAGS="-L$with_cuda/lib$SUFFIX" + NVCC="$with_cuda/bin/nvcc" +else + CUDA_CFLAGS="-I/usr/local/cuda/include" + CUDA_LIBS="-lcudart -static-libstdc++" + CUDA_LDFLAGS="-L/usr/local/cuda/lib$SUFFIX" + NVCC="nvcc" +fi +AC_SUBST(CUDA_CFLAGS) +AC_SUBST(CUDA_LIBS) +AC_SUBST(CUDA_LDFLAGS) +AC_SUBST(NVCC) + +AC_SUBST(OPENMP_CFLAGS) + +AC_OUTPUT diff --git a/configure.sh b/configure.sh new file mode 100644 index 0000000..134abd1 --- /dev/null +++ b/configure.sh @@ -0,0 +1 @@ +./configure "CFLAGS=-O3" "CXXFLAGS=-O3" --with-cuda=/usr/local/cuda diff --git a/cpu-miner.c b/cpu-miner.c new file mode 100644 index 0000000..d3cb4ef --- /dev/null +++ b/cpu-miner.c @@ -0,0 +1,1523 @@ +/* + * Copyright 2010 Jeff Garzik + * Copyright 2012-2014 pooler + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. See COPYING for more details. + */ + +#include "cpuminer-config.h" +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef WIN32 +#include +#else +#include +#include +#include +#if HAVE_SYS_SYSCTL_H +#include +#if HAVE_SYS_PARAM_H +#include +#endif +#include +#endif +#endif +#include +#include +#include "compat.h" +#include "miner.h" + +#define PROGRAM_NAME "minerd" +#define LP_SCANTIME 60 +#define HEAVYCOIN_BLKHDR_SZ 84 + +// from heavy.cu +#ifdef __cplusplus +extern "C" +#endif +int cuda_num_devices(); + + +#ifdef __linux /* Linux specific policy and affinity management */ +#include +static inline void drop_policy(void) +{ + struct sched_param param; + param.sched_priority = 0; + +#ifdef SCHED_IDLE + if (unlikely(sched_setscheduler(0, SCHED_IDLE, ¶m) == -1)) +#endif +#ifdef SCHED_BATCH + sched_setscheduler(0, SCHED_BATCH, ¶m); +#endif +} + +static inline void affine_to_cpu(int id, int cpu) +{ + cpu_set_t set; + + CPU_ZERO(&set); + CPU_SET(cpu, &set); + sched_setaffinity(0, sizeof(&set), &set); +} +#elif defined(__FreeBSD__) /* FreeBSD specific policy and affinity management */ +#include +static inline void drop_policy(void) +{ +} + +static inline void affine_to_cpu(int id, int cpu) +{ + cpuset_t set; + CPU_ZERO(&set); + CPU_SET(cpu, &set); + cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, sizeof(cpuset_t), &set); +} +#else +static inline void drop_policy(void) +{ +} + +static inline void affine_to_cpu(int id, int cpu) +{ +} +#endif + +enum workio_commands { + WC_GET_WORK, + WC_SUBMIT_WORK, +}; + +struct workio_cmd { + enum workio_commands cmd; + struct thr_info *thr; + union { + struct work *work; + } u; +}; + +typedef enum { + ALGO_SCRYPT, /* scrypt(1024,1,1) */ + ALGO_SHA256D, /* SHA-256d */ + ALGO_HEAVY, /* Heavycoin hash */ + ALGO_FUGUE256, /* 
Fugue256 */ +} sha256_algos; + +static const char *algo_names[] = { + "scrypt", + "sha256d", + "heavy", + "fugue256" +}; + +bool opt_debug = false; +bool opt_protocol = false; +static bool opt_benchmark = false; +bool want_longpoll = true; +bool have_longpoll = false; +bool want_stratum = true; +bool have_stratum = false; +static bool submit_old = false; +bool use_syslog = false; +static bool opt_background = false; +static bool opt_quiet = false; +static int opt_retries = -1; +static int opt_fail_pause = 30; +int opt_timeout = 270; +static int opt_scantime = 5; +static json_t *opt_config; +static const bool opt_time = true; +static sha256_algos opt_algo = ALGO_HEAVY; +static int opt_n_threads; +bool opt_trust_pool = false; +uint16_t opt_vote = 9999; +static int num_processors; +static char *rpc_url; +static char *rpc_userpass; +static char *rpc_user, *rpc_pass; +char *opt_cert; +char *opt_proxy; +long opt_proxy_type; +struct thr_info *thr_info; +static int work_thr_id; +int longpoll_thr_id = -1; +int stratum_thr_id = -1; +struct work_restart *work_restart = NULL; +static struct stratum_ctx stratum; + +pthread_mutex_t applog_lock; +static pthread_mutex_t stats_lock; + +static unsigned long accepted_count = 0L; +static unsigned long rejected_count = 0L; +static double *thr_hashrates; + +#ifdef HAVE_GETOPT_LONG +#include +#else +struct option { + const char *name; + int has_arg; + int *flag; + int val; +}; +#endif + +static char const usage[] = "\ +Usage: " PROGRAM_NAME " [OPTIONS]\n\ +Options:\n\ + -a, --algo=ALGO specify the algorithm to use\n\ + scrypt scrypt(1024, 1, 1) (default)\n\ + sha256d SHA-256d\n\ + heavy Heavycoin hash\n\ + -v, --vote=VOTE block reward vote\n\ + -m, --trust-pool trust the max block reward vote (maxvote) sent by the pool\n\ + -o, --url=URL URL of mining server\n\ + -O, --userpass=U:P username:password pair for mining server\n\ + -u, --user=USERNAME username for mining server\n\ + -p, --pass=PASSWORD password for mining server\n\ + --cert=FILE certificate for mining server using SSL\n\ + -x, --proxy=[PROTOCOL://]HOST[:PORT] connect through a proxy\n\ + -t, --threads=N number of miner threads (default: number of processors)\n\ + -r, --retries=N number of times to retry if a network call fails\n\ + (default: retry indefinitely)\n\ + -R, --retry-pause=N time to pause between retries, in seconds (default: 30)\n\ + -T, --timeout=N network timeout, in seconds (default: 270)\n\ + -s, --scantime=N upper bound on time spent scanning current work when\n\ + long polling is unavailable, in seconds (default: 5)\n\ + --no-longpoll disable X-Long-Polling support\n\ + --no-stratum disable X-Stratum support\n\ + -q, --quiet disable per-thread hashmeter output\n\ + -D, --debug enable debug output\n\ + -P, --protocol-dump verbose dump of protocol-level activities\n" +#ifdef HAVE_SYSLOG_H +"\ + -S, --syslog use system log for output messages\n" +#endif +#ifndef WIN32 +"\ + -B, --background run the miner in the background\n" +#endif +"\ + --benchmark run in offline benchmark mode\n\ + -c, --config=FILE load a JSON-format configuration file\n\ + -V, --version display version information and exit\n\ + -h, --help display this help text and exit\n\ +"; + +static char const short_options[] = +#ifndef WIN32 + "B" +#endif +#ifdef HAVE_SYSLOG_H + "S" +#endif + "a:c:Dhp:Px:qr:R:s:t:T:o:u:O:Vmv:"; + +static struct option const options[] = { + { "algo", 1, NULL, 'a' }, +#ifndef WIN32 + { "background", 0, NULL, 'B' }, +#endif + { "benchmark", 0, NULL, 1005 }, + { "cert", 1, NULL, 1001 }, + { 
"config", 1, NULL, 'c' }, + { "debug", 0, NULL, 'D' }, + { "help", 0, NULL, 'h' }, + { "no-longpoll", 0, NULL, 1003 }, + { "no-stratum", 0, NULL, 1007 }, + { "pass", 1, NULL, 'p' }, + { "protocol-dump", 0, NULL, 'P' }, + { "proxy", 1, NULL, 'x' }, + { "quiet", 0, NULL, 'q' }, + { "retries", 1, NULL, 'r' }, + { "retry-pause", 1, NULL, 'R' }, + { "scantime", 1, NULL, 's' }, +#ifdef HAVE_SYSLOG_H + { "syslog", 0, NULL, 'S' }, +#endif + { "threads", 1, NULL, 't' }, + { "vote", 1, NULL, 'v' }, + { "trust-pool", 0, NULL, 'm' }, + { "timeout", 1, NULL, 'T' }, + { "url", 1, NULL, 'o' }, + { "user", 1, NULL, 'u' }, + { "userpass", 1, NULL, 'O' }, + { "version", 0, NULL, 'V' }, + { 0, 0, 0, 0 } +}; + +struct work { + uint32_t data[32]; + uint32_t target[8]; + uint32_t maxvote; + + char job_id[128]; + size_t xnonce2_len; + unsigned char xnonce2[32]; +}; + +static struct work g_work; +static time_t g_work_time; +static pthread_mutex_t g_work_lock; + +static bool jobj_binary(const json_t *obj, const char *key, + void *buf, size_t buflen) +{ + const char *hexstr; + json_t *tmp; + + tmp = json_object_get(obj, key); + if (unlikely(!tmp)) { + applog(LOG_ERR, "JSON key '%s' not found", key); + return false; + } + hexstr = json_string_value(tmp); + if (unlikely(!hexstr)) { + applog(LOG_ERR, "JSON key '%s' is not a string", key); + return false; + } + if (!hex2bin((unsigned char*)buf, hexstr, buflen)) + return false; + + return true; +} + +static bool work_decode(const json_t *val, struct work *work) +{ + int i; + + if (unlikely(!jobj_binary(val, "data", work->data, sizeof(work->data)))) { + applog(LOG_ERR, "JSON inval data"); + goto err_out; + } + if (unlikely(!jobj_binary(val, "target", work->target, sizeof(work->target)))) { + applog(LOG_ERR, "JSON inval target"); + goto err_out; + } + if (opt_algo == ALGO_HEAVY) { + if (unlikely(!jobj_binary(val, "maxvote", &work->maxvote, sizeof(work->maxvote)))) { + work->maxvote = 1024; + } + } else work->maxvote = 0; + + for (i = 0; i < ARRAY_SIZE(work->data); i++) + work->data[i] = le32dec(work->data + i); + for (i = 0; i < ARRAY_SIZE(work->target); i++) + work->target[i] = le32dec(work->target + i); + + return true; + +err_out: + return false; +} + +static void share_result(int result, const char *reason) +{ + char s[345]; + double hashrate; + int i; + + hashrate = 0.; + pthread_mutex_lock(&stats_lock); + for (i = 0; i < opt_n_threads; i++) + hashrate += thr_hashrates[i]; + result ? accepted_count++ : rejected_count++; + pthread_mutex_unlock(&stats_lock); + + sprintf(s, hashrate >= 1e6 ? "%.0f" : "%.2f", 1e-3 * hashrate); + applog(LOG_INFO, "accepted: %lu/%lu (%.2f%%), %s khash/s %s", + accepted_count, + accepted_count + rejected_count, + 100. * accepted_count / (accepted_count + rejected_count), + s, + result ? 
"(yay!!!)" : "(booooo)"); + + if (opt_debug && reason) + applog(LOG_DEBUG, "DEBUG: reject reason: %s", reason); +} + +static bool submit_upstream_work(CURL *curl, struct work *work) +{ + char *str = NULL; + json_t *val, *res, *reason; + char s[345]; + int i; + bool rc = false; + + /* pass if the previous hash is not the current previous hash */ + if (memcmp(work->data + 1, g_work.data + 1, 32)) { + if (opt_debug) + applog(LOG_DEBUG, "DEBUG: stale work detected, discarding"); + return true; + } + + if (have_stratum) { + uint32_t ntime, nonce; + uint16_t nvote; + char *ntimestr, *noncestr, *xnonce2str, *nvotestr; + + le32enc(&ntime, work->data[17]); + le32enc(&nonce, work->data[19]); + be16enc(&nvote, *((uint16_t*)&work->data[20])); + + ntimestr = bin2hex((const unsigned char *)(&ntime), 4); + noncestr = bin2hex((const unsigned char *)(&nonce), 4); + xnonce2str = bin2hex(work->xnonce2, work->xnonce2_len); + nvotestr = bin2hex((const unsigned char *)(&nvote), 2); + if (opt_algo == ALGO_HEAVY) { + sprintf(s, + "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}", + rpc_user, work->job_id, xnonce2str, ntimestr, noncestr, nvotestr); + } else { + sprintf(s, + "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}", + rpc_user, work->job_id, xnonce2str, ntimestr, noncestr); + } + free(ntimestr); + free(noncestr); + free(xnonce2str); + free(nvotestr); + + if (unlikely(!stratum_send_line(&stratum, s))) { + applog(LOG_ERR, "submit_upstream_work stratum_send_line failed"); + goto out; + } + } else { + + /* build hex string */ + + if (opt_algo != ALGO_HEAVY) { + for (i = 0; i < ARRAY_SIZE(work->data); i++) + le32enc(work->data + i, work->data[i]); + } + str = bin2hex((unsigned char *)work->data, sizeof(work->data)); + if (unlikely(!str)) { + applog(LOG_ERR, "submit_upstream_work OOM"); + goto out; + } + + /* build JSON-RPC request */ + sprintf(s, + "{\"method\": \"getwork\", \"params\": [ \"%s\" ], \"id\":1}\r\n", + str); + + /* issue JSON-RPC request */ + val = json_rpc_call(curl, rpc_url, rpc_userpass, s, false, false, NULL); + if (unlikely(!val)) { + applog(LOG_ERR, "submit_upstream_work json_rpc_call failed"); + goto out; + } + + res = json_object_get(val, "result"); + reason = json_object_get(val, "reject-reason"); + share_result(json_is_true(res), reason ? 
json_string_value(reason) : NULL); + + json_decref(val); + } + + rc = true; + +out: + free(str); + return rc; +} + +static const char *rpc_req = + "{\"method\": \"getwork\", \"params\": [], \"id\":0}\r\n"; + +static bool get_upstream_work(CURL *curl, struct work *work) +{ + json_t *val; + bool rc; + struct timeval tv_start, tv_end, diff; + + gettimeofday(&tv_start, NULL); + val = json_rpc_call(curl, rpc_url, rpc_userpass, rpc_req, + want_longpoll, false, NULL); + gettimeofday(&tv_end, NULL); + + if (have_stratum) { + if (val) + json_decref(val); + return true; + } + + if (!val) + return false; + + rc = work_decode(json_object_get(val, "result"), work); + + if (opt_debug && rc) { + timeval_subtract(&diff, &tv_end, &tv_start); + applog(LOG_DEBUG, "DEBUG: got new work in %d ms", + diff.tv_sec * 1000 + diff.tv_usec / 1000); + } + + json_decref(val); + + return rc; +} + +static void workio_cmd_free(struct workio_cmd *wc) +{ + if (!wc) + return; + + switch (wc->cmd) { + case WC_SUBMIT_WORK: + free(wc->u.work); + break; + default: /* do nothing */ + break; + } + + memset(wc, 0, sizeof(*wc)); /* poison */ + free(wc); +} + +static bool workio_get_work(struct workio_cmd *wc, CURL *curl) +{ + struct work *ret_work; + int failures = 0; + + ret_work = (struct work*)calloc(1, sizeof(*ret_work)); + if (!ret_work) + return false; + + /* obtain new work from bitcoin via JSON-RPC */ + while (!get_upstream_work(curl, ret_work)) { + if (unlikely((opt_retries >= 0) && (++failures > opt_retries))) { + applog(LOG_ERR, "json_rpc_call failed, terminating workio thread"); + free(ret_work); + return false; + } + + /* pause, then restart work-request loop */ + applog(LOG_ERR, "json_rpc_call failed, retry after %d seconds", + opt_fail_pause); + sleep(opt_fail_pause); + } + + /* send work to requesting thread */ + if (!tq_push(wc->thr->q, ret_work)) + free(ret_work); + + return true; +} + +static bool workio_submit_work(struct workio_cmd *wc, CURL *curl) +{ + int failures = 0; + + /* submit solution to bitcoin via JSON-RPC */ + while (!submit_upstream_work(curl, wc->u.work)) { + if (unlikely((opt_retries >= 0) && (++failures > opt_retries))) { + applog(LOG_ERR, "...terminating workio thread"); + return false; + } + + /* pause, then restart work-request loop */ + applog(LOG_ERR, "...retry after %d seconds", + opt_fail_pause); + sleep(opt_fail_pause); + } + + return true; +} + +static void *workio_thread(void *userdata) +{ + struct thr_info *mythr = (struct thr_info*)userdata; + CURL *curl; + bool ok = true; + + curl = curl_easy_init(); + if (unlikely(!curl)) { + applog(LOG_ERR, "CURL initialization failed"); + return NULL; + } + + while (ok) { + struct workio_cmd *wc; + + /* wait for workio_cmd sent to us, on our queue */ + wc = (struct workio_cmd *)tq_pop(mythr->q, NULL); + if (!wc) { + ok = false; + break; + } + + /* process workio_cmd */ + switch (wc->cmd) { + case WC_GET_WORK: + ok = workio_get_work(wc, curl); + break; + case WC_SUBMIT_WORK: + ok = workio_submit_work(wc, curl); + break; + + default: /* should never happen */ + ok = false; + break; + } + + workio_cmd_free(wc); + } + + tq_freeze(mythr->q); + curl_easy_cleanup(curl); + + return NULL; +} + +static bool get_work(struct thr_info *thr, struct work *work) +{ + struct workio_cmd *wc; + struct work *work_heap; + + if (opt_benchmark) { + memset(work->data, 0x55, 76); + work->data[17] = swab32((uint32_t)time(NULL)); + memset(work->data + 19, 0x00, 52); + work->data[20] = 0x80000000; + work->data[31] = 0x00000280; + memset(work->target, 0x00, 
sizeof(work->target)); + return true; + } + + /* fill out work request message */ + wc = (struct workio_cmd *)calloc(1, sizeof(*wc)); + if (!wc) + return false; + + wc->cmd = WC_GET_WORK; + wc->thr = thr; + + /* send work request to workio thread */ + if (!tq_push(thr_info[work_thr_id].q, wc)) { + workio_cmd_free(wc); + return false; + } + + /* wait for response, a unit of work */ + work_heap = (struct work *)tq_pop(thr->q, NULL); + if (!work_heap) + return false; + + /* copy returned work into storage provided by caller */ + memcpy(work, work_heap, sizeof(*work)); + free(work_heap); + + return true; +} + +static bool submit_work(struct thr_info *thr, const struct work *work_in) +{ + struct workio_cmd *wc; + /* fill out work request message */ + wc = (struct workio_cmd *)calloc(1, sizeof(*wc)); + if (!wc) + return false; + + wc->u.work = (struct work *)malloc(sizeof(*work_in)); + if (!wc->u.work) + goto err_out; + + wc->cmd = WC_SUBMIT_WORK; + wc->thr = thr; + memcpy(wc->u.work, work_in, sizeof(*work_in)); + + /* send solution to workio thread */ + if (!tq_push(thr_info[work_thr_id].q, wc)) + goto err_out; + + return true; + +err_out: + workio_cmd_free(wc); + return false; +} + +static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work) +{ + unsigned char merkle_root[64]; + int i; + + pthread_mutex_lock(&sctx->work_lock); + + strcpy(work->job_id, sctx->job.job_id); + work->xnonce2_len = sctx->xnonce2_size; + memcpy(work->xnonce2, sctx->job.xnonce2, sctx->xnonce2_size); + + /* Generate merkle root */ + if (opt_algo == ALGO_HEAVY) + heavycoin_hash(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size); + else + sha256d(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size); + for (i = 0; i < sctx->job.merkle_count; i++) { + memcpy(merkle_root + 32, sctx->job.merkle[i], 32); + if (opt_algo == ALGO_HEAVY) + heavycoin_hash(merkle_root, merkle_root, 64); + else + sha256d(merkle_root, merkle_root, 64); + } + + /* Increment extranonce2 */ + for (i = 0; i < (int)sctx->xnonce2_size && !++sctx->job.xnonce2[i]; i++); + + /* Assemble block header */ + memset(work->data, 0, 128); + work->data[0] = le32dec(sctx->job.version); + for (i = 0; i < 8; i++) + work->data[1 + i] = le32dec((uint32_t *)sctx->job.prevhash + i); + for (i = 0; i < 8; i++) + work->data[9 + i] = be32dec((uint32_t *)merkle_root + i); + work->data[17] = le32dec(sctx->job.ntime); + work->data[18] = le32dec(sctx->job.nbits); + work->data[20] = 0x80000000; + work->data[31] = 0x00000280; + + // HeavyCoin + if (opt_algo == ALGO_HEAVY) { + uint16_t *ext; + work->maxvote = 1024; + ext = (uint16_t*)(&work->data[20]); + ext[0] = opt_vote; + ext[1] = be16dec(sctx->job.nreward); + + for (i = 0; i < 20; i++) + work->data[i] = be32dec((uint32_t *)&work->data[i]); + } + // + + pthread_mutex_unlock(&sctx->work_lock); + + if (opt_debug) { + char *xnonce2str = bin2hex(work->xnonce2, sctx->xnonce2_size); + applog(LOG_DEBUG, "DEBUG: job_id='%s' extranonce2=%s ntime=%08x", + work->job_id, xnonce2str, swab32(work->data[17])); + free(xnonce2str); + } + + if (opt_algo == ALGO_SCRYPT) + diff_to_target(work->target, sctx->job.diff / 65536.0); + else + diff_to_target(work->target, sctx->job.diff); +} + +static void *miner_thread(void *userdata) +{ + struct thr_info *mythr = (struct thr_info *)userdata; + int thr_id = mythr->id; + struct work work; + uint32_t max_nonce; + uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - 0x20; + unsigned char *scratchbuf = NULL; + char s[16]; + int i; + + memset(&work, 0, 
sizeof(work)); // prevent work from being used uninitialized + + /* Set worker threads to nice 19 and then preferentially to SCHED_IDLE + * and if that fails, then SCHED_BATCH. No need for this to be an + * error if it fails */ + if (!opt_benchmark) { + setpriority(PRIO_PROCESS, 0, 19); + drop_policy(); + } + + /* Cpu affinity only makes sense if the number of threads is a multiple + * of the number of CPUs */ + if (num_processors > 1 && opt_n_threads % num_processors == 0) { + if (!opt_quiet) + applog(LOG_INFO, "Binding thread %d to cpu %d", + thr_id, thr_id % num_processors); + affine_to_cpu(thr_id, thr_id % num_processors); + } + + if (opt_algo == ALGO_SCRYPT) + { + scratchbuf = scrypt_buffer_alloc(); + } + + while (1) { + unsigned long hashes_done; + struct timeval tv_start, tv_end, diff; + int64_t max64; + int rc; + + if (have_stratum) { + while (time(NULL) >= g_work_time + 120) + sleep(1); + pthread_mutex_lock(&g_work_lock); + if (work.data[19] >= end_nonce) + stratum_gen_work(&stratum, &g_work); + } else { + /* obtain new work from internal workio thread */ + pthread_mutex_lock(&g_work_lock); + if (!have_stratum && (!have_longpoll || + time(NULL) >= g_work_time + LP_SCANTIME*3/4 || + work.data[19] >= end_nonce)) { + if (unlikely(!get_work(mythr, &g_work))) { + applog(LOG_ERR, "work retrieval failed, exiting " + "mining thread %d", mythr->id); + pthread_mutex_unlock(&g_work_lock); + goto out; + } + g_work_time = have_stratum ? 0 : time(NULL); + } + if (have_stratum) { + pthread_mutex_unlock(&g_work_lock); + continue; + } + } + if (memcmp(work.data, g_work.data, 76)) { + memcpy(&work, &g_work, sizeof(struct work)); + work.data[19] = 0xffffffffU / opt_n_threads * thr_id; + } else + work.data[19]++; + pthread_mutex_unlock(&g_work_lock); + work_restart[thr_id].restart = 0; + + /* adjust max_nonce to meet target scan time */ + if (have_stratum) + max64 = LP_SCANTIME; + else + max64 = g_work_time + (have_longpoll ? LP_SCANTIME : opt_scantime) + - time(NULL); + max64 *= (int64_t)thr_hashrates[thr_id]; + if (max64 <= 0) + max64 = opt_algo == ALGO_SCRYPT ? 0xfffLL : 0x1fffffLL; + if (work.data[19] + max64 > end_nonce) + max_nonce = end_nonce; + else + max_nonce = (uint32_t)(work.data[19] + max64); + + hashes_done = 0; + gettimeofday(&tv_start, NULL); + + /* scan nonces for a proof-of-work hash */ + switch (opt_algo) { + case ALGO_SCRYPT: + rc = scanhash_scrypt(thr_id, work.data, scratchbuf, work.target, + max_nonce, &hashes_done); + break; + + case ALGO_SHA256D: + rc = scanhash_sha256d(thr_id, work.data, work.target, + max_nonce, &hashes_done); + break; + + case ALGO_HEAVY: + rc = scanhash_heavy(thr_id, work.data, work.target, + max_nonce, &hashes_done, work.maxvote); + break; + + case ALGO_FUGUE256: + rc = scanhash_fugue256(thr_id, work.data, work.target, + max_nonce, &hashes_done); + break; + + default: + /* should never happen */ + goto out; + } + + /* record scanhash elapsed time */ + gettimeofday(&tv_end, NULL); + timeval_subtract(&diff, &tv_end, &tv_start); + if (diff.tv_usec || diff.tv_sec) { + pthread_mutex_lock(&stats_lock); + thr_hashrates[thr_id] = + hashes_done / (diff.tv_sec + 1e-6 * diff.tv_usec); + pthread_mutex_unlock(&stats_lock); + } + if (!opt_quiet) { + sprintf(s, thr_hashrates[thr_id] >= 1e6 ? 
"%.0f" : "%.2f", + 1e-3 * thr_hashrates[thr_id]); + applog(LOG_INFO, "thread %d: %lu hashes, %s khash/s", + thr_id, hashes_done, s); + } + if (opt_benchmark && thr_id == opt_n_threads - 1) { + double hashrate = 0.; + for (i = 0; i < opt_n_threads && thr_hashrates[i]; i++) + hashrate += thr_hashrates[i]; + if (i == opt_n_threads) { + sprintf(s, hashrate >= 1e6 ? "%.0f" : "%.2f", 1e-3 * hashrate); + applog(LOG_INFO, "Total: %s khash/s", s); + } + } + + /* if nonce found, submit work */ + if (rc && !opt_benchmark && !submit_work(mythr, &work)) + break; + } + +out: + tq_freeze(mythr->q); + + return NULL; +} + +static void restart_threads(void) +{ + int i; + + for (i = 0; i < opt_n_threads; i++) + work_restart[i].restart = 1; +} + +static void *longpoll_thread(void *userdata) +{ + struct thr_info *mythr = (struct thr_info *)userdata; + CURL *curl = NULL; + char *copy_start, *hdr_path = NULL, *lp_url = NULL; + bool need_slash = false; + + curl = curl_easy_init(); + if (unlikely(!curl)) { + applog(LOG_ERR, "CURL initialization failed"); + goto out; + } + +start: + hdr_path = (char*)tq_pop(mythr->q, NULL); + if (!hdr_path) + goto out; + + /* full URL */ + if (strstr(hdr_path, "://")) { + lp_url = hdr_path; + hdr_path = NULL; + } + + /* absolute path, on current server */ + else { + copy_start = (*hdr_path == '/') ? (hdr_path + 1) : hdr_path; + if (rpc_url[strlen(rpc_url) - 1] != '/') + need_slash = true; + + lp_url = (char*)malloc(strlen(rpc_url) + strlen(copy_start) + 2); + if (!lp_url) + goto out; + + sprintf(lp_url, "%s%s%s", rpc_url, need_slash ? "/" : "", copy_start); + } + + applog(LOG_INFO, "Long-polling activated for %s", lp_url); + + while (1) { + json_t *val, *soval; + int err; + + val = json_rpc_call(curl, lp_url, rpc_userpass, rpc_req, + false, true, &err); + if (have_stratum) { + if (val) + json_decref(val); + goto out; + } + if (likely(val)) { + applog(LOG_INFO, "LONGPOLL detected new block"); + soval = json_object_get(json_object_get(val, "result"), "submitold"); + submit_old = soval ? json_is_true(soval) : false; + pthread_mutex_lock(&g_work_lock); + if (work_decode(json_object_get(val, "result"), &g_work)) { + if (opt_debug) + applog(LOG_DEBUG, "DEBUG: got new work"); + time(&g_work_time); + restart_threads(); + } + pthread_mutex_unlock(&g_work_lock); + json_decref(val); + } else { + pthread_mutex_lock(&g_work_lock); + g_work_time -= LP_SCANTIME; + pthread_mutex_unlock(&g_work_lock); + if (err == CURLE_OPERATION_TIMEDOUT) { + restart_threads(); + } else { + have_longpoll = false; + restart_threads(); + free(hdr_path); + free(lp_url); + lp_url = NULL; + sleep(opt_fail_pause); + goto start; + } + } + } + +out: + free(hdr_path); + free(lp_url); + tq_freeze(mythr->q); + if (curl) + curl_easy_cleanup(curl); + + return NULL; +} + +static bool stratum_handle_response(char *buf) +{ + json_t *val, *err_val, *res_val, *id_val; + json_error_t err; + bool ret = false; + + val = JSON_LOADS(buf, &err); + if (!val) { + applog(LOG_INFO, "JSON decode failed(%d): %s", err.line, err.text); + goto out; + } + + res_val = json_object_get(val, "result"); + err_val = json_object_get(val, "error"); + id_val = json_object_get(val, "id"); + + if (!id_val || json_is_null(id_val) || !res_val) + goto out; + + share_result(json_is_true(res_val), + err_val ? 
json_string_value(json_array_get(err_val, 1)) : NULL); + + ret = true; +out: + if (val) + json_decref(val); + + return ret; +} + +static void *stratum_thread(void *userdata) +{ + struct thr_info *mythr = (struct thr_info *)userdata; + char *s; + + stratum.url = (char*)tq_pop(mythr->q, NULL); + if (!stratum.url) + goto out; + applog(LOG_INFO, "Starting Stratum on %s", stratum.url); + + while (1) { + int failures = 0; + + while (!stratum.curl) { + pthread_mutex_lock(&g_work_lock); + g_work_time = 0; + pthread_mutex_unlock(&g_work_lock); + restart_threads(); + + if (!stratum_connect(&stratum, stratum.url) || + !stratum_subscribe(&stratum) || + !stratum_authorize(&stratum, rpc_user, rpc_pass)) { + stratum_disconnect(&stratum); + if (opt_retries >= 0 && ++failures > opt_retries) { + applog(LOG_ERR, "...terminating workio thread"); + tq_push(thr_info[work_thr_id].q, NULL); + goto out; + } + applog(LOG_ERR, "...retry after %d seconds", opt_fail_pause); + sleep(opt_fail_pause); + } + } + + if (stratum.job.job_id && + (strcmp(stratum.job.job_id, g_work.job_id) || !g_work_time)) { + pthread_mutex_lock(&g_work_lock); + stratum_gen_work(&stratum, &g_work); + time(&g_work_time); + pthread_mutex_unlock(&g_work_lock); + if (stratum.job.clean) { + applog(LOG_INFO, "Stratum detected new block"); + restart_threads(); + } + } + + if (!stratum_socket_full(&stratum, 120)) { + applog(LOG_ERR, "Stratum connection timed out"); + s = NULL; + } else + s = stratum_recv_line(&stratum); + if (!s) { + stratum_disconnect(&stratum); + applog(LOG_ERR, "Stratum connection interrupted"); + continue; + } + if (!stratum_handle_method(&stratum, s)) + stratum_handle_response(s); + free(s); + } + +out: + return NULL; +} + +static void show_version_and_exit(void) +{ + printf("%s\n%s\n", PACKAGE_STRING, curl_version()); + exit(0); +} + +static void show_usage_and_exit(int status) +{ + if (status) + fprintf(stderr, "Try `" PROGRAM_NAME " --help' for more information.\n"); + else + printf(usage); + exit(status); +} + +static void parse_arg (int key, char *arg) +{ + char *p; + int v, i; + + switch(key) { + case 'a': + for (i = 0; i < ARRAY_SIZE(algo_names); i++) { + if (algo_names[i] && + !strcmp(arg, algo_names[i])) { + opt_algo = (sha256_algos)i; + break; + } + } + if (i == ARRAY_SIZE(algo_names)) + show_usage_and_exit(1); + break; + case 'B': + opt_background = true; + break; + case 'c': { + json_error_t err; + if (opt_config) + json_decref(opt_config); +#if JANSSON_VERSION_HEX >= 0x020000 + opt_config = json_load_file(arg, 0, &err); +#else + opt_config = json_load_file(arg, &err); +#endif + if (!json_is_object(opt_config)) { + applog(LOG_ERR, "JSON decode of %s failed", arg); + exit(1); + } + break; + } + case 'q': + opt_quiet = true; + break; + case 'D': + opt_debug = true; + break; + case 'p': + free(rpc_pass); + rpc_pass = strdup(arg); + break; + case 'P': + opt_protocol = true; + break; + case 'r': + v = atoi(arg); + if (v < -1 || v > 9999) /* sanity check */ + show_usage_and_exit(1); + opt_retries = v; + break; + case 'R': + v = atoi(arg); + if (v < 1 || v > 9999) /* sanity check */ + show_usage_and_exit(1); + opt_fail_pause = v; + break; + case 's': + v = atoi(arg); + if (v < 1 || v > 9999) /* sanity check */ + show_usage_and_exit(1); + opt_scantime = v; + break; + case 'T': + v = atoi(arg); + if (v < 1 || v > 99999) /* sanity check */ + show_usage_and_exit(1); + opt_timeout = v; + break; + case 't': + v = atoi(arg); + if (v < 1 || v > 9999) /* sanity check */ + show_usage_and_exit(1); + opt_n_threads = v; + break; + case 
'v': + v = atoi(arg); + if (v < 0 || v > 1024) /* sanity check */ + show_usage_and_exit(1); + opt_vote = (uint16_t)v; + break; + case 'm': + opt_trust_pool = true; + break; + case 'u': + free(rpc_user); + rpc_user = strdup(arg); + break; + case 'o': /* --url */ + p = strstr(arg, "://"); + if (p) { + if (strncasecmp(arg, "http://", 7) && strncasecmp(arg, "https://", 8) && + strncasecmp(arg, "stratum+tcp://", 14)) + show_usage_and_exit(1); + free(rpc_url); + rpc_url = strdup(arg); + } else { + if (!strlen(arg) || *arg == '/') + show_usage_and_exit(1); + free(rpc_url); + rpc_url = (char*)malloc(strlen(arg) + 8); + sprintf(rpc_url, "http://%s", arg); + } + p = strrchr(rpc_url, '@'); + if (p) { + char *sp, *ap; + *p = '\0'; + ap = strstr(rpc_url, "://") + 3; + sp = strchr(ap, ':'); + if (sp) { + free(rpc_userpass); + rpc_userpass = strdup(ap); + free(rpc_user); + rpc_user = (char*)calloc(sp - ap + 1, 1); + strncpy(rpc_user, ap, sp - ap); + free(rpc_pass); + rpc_pass = strdup(sp + 1); + } else { + free(rpc_user); + rpc_user = strdup(ap); + } + memmove(ap, p + 1, strlen(p + 1) + 1); + } + have_stratum = !opt_benchmark && !strncasecmp(rpc_url, "stratum", 7); + break; + case 'O': /* --userpass */ + p = strchr(arg, ':'); + if (!p) + show_usage_and_exit(1); + free(rpc_userpass); + rpc_userpass = strdup(arg); + free(rpc_user); + rpc_user = (char*)calloc(p - arg + 1, 1); + strncpy(rpc_user, arg, p - arg); + free(rpc_pass); + rpc_pass = strdup(p + 1); + break; + case 'x': /* --proxy */ + if (!strncasecmp(arg, "socks4://", 9)) + opt_proxy_type = CURLPROXY_SOCKS4; + else if (!strncasecmp(arg, "socks5://", 9)) + opt_proxy_type = CURLPROXY_SOCKS5; +#if LIBCURL_VERSION_NUM >= 0x071200 + else if (!strncasecmp(arg, "socks4a://", 10)) + opt_proxy_type = CURLPROXY_SOCKS4A; + else if (!strncasecmp(arg, "socks5h://", 10)) + opt_proxy_type = CURLPROXY_SOCKS5_HOSTNAME; +#endif + else + opt_proxy_type = CURLPROXY_HTTP; + free(opt_proxy); + opt_proxy = strdup(arg); + break; + case 1001: + free(opt_cert); + opt_cert = strdup(arg); + break; + case 1005: + opt_benchmark = true; + want_longpoll = false; + want_stratum = false; + have_stratum = false; + break; + case 1003: + want_longpoll = false; + break; + case 1007: + want_stratum = false; + break; + case 'S': + use_syslog = true; + break; + case 'V': + show_version_and_exit(); + case 'h': + show_usage_and_exit(0); + default: + show_usage_and_exit(1); + } +} + +static void parse_config(void) +{ + int i; + json_t *val; + + if (!json_is_object(opt_config)) + return; + + for (i = 0; i < ARRAY_SIZE(options); i++) { + if (!options[i].name) + break; + if (!strcmp(options[i].name, "config")) + continue; + + val = json_object_get(opt_config, options[i].name); + if (!val) + continue; + + if (options[i].has_arg && json_is_string(val)) { + char *s = strdup(json_string_value(val)); + if (!s) + break; + parse_arg(options[i].val, s); + free(s); + } else if (!options[i].has_arg && json_is_true(val)) + parse_arg(options[i].val, ""); + else + applog(LOG_ERR, "JSON option %s invalid", + options[i].name); + } + + if (opt_algo == ALGO_HEAVY && opt_vote == 9999) { + fprintf(stderr, "Heavycoin hash requires block reward vote parameter (see --vote)\n"); + show_usage_and_exit(1); + } +} + +static void parse_cmdline(int argc, char *argv[]) +{ + int key; + + while (1) { +#if HAVE_GETOPT_LONG + key = getopt_long(argc, argv, short_options, options, NULL); +#else + key = getopt(argc, argv, short_options); +#endif + if (key < 0) + break; + + parse_arg(key, optarg); + } + if (optind < argc) { + 
fprintf(stderr, "%s: unsupported non-option argument '%s'\n", + argv[0], argv[optind]); + show_usage_and_exit(1); + } + + if (opt_algo == ALGO_HEAVY && opt_vote == 9999) { + fprintf(stderr, "%s: Heavycoin hash requires block reward vote parameter (see --vote)\n", + argv[0]); + show_usage_and_exit(1); + } + + parse_config(); +} + +#ifndef WIN32 +static void signal_handler(int sig) +{ + switch (sig) { + case SIGHUP: + applog(LOG_INFO, "SIGHUP received"); + break; + case SIGINT: + applog(LOG_INFO, "SIGINT received, exiting"); + exit(0); + break; + case SIGTERM: + applog(LOG_INFO, "SIGTERM received, exiting"); + exit(0); + break; + } +} +#endif + +#define PROGRAM_VERSION "0.1" +int main(int argc, char *argv[]) +{ + struct thr_info *thr; + long flags; + int i; + +#ifdef WIN32 + SYSTEM_INFO sysinfo; +#endif + + printf(" *** ccMiner for nVidia GPUs by Christian Buchner and Christian H. ***\n"); + printf("\t This is version "PROGRAM_VERSION" (beta)\n"); + printf("\t based on pooler-cpuminer 2.3.2 (c) 2010 Jeff Garzik, 2012 pooler\n"); + printf("\t based on pooler-cpuminer extension for HVC from\n\t https://github.com/heavycoin/cpuminer-heavycoin\n"); + printf("\t\t\tand\n\t http://hvc.1gh.com/\n"); + printf("\tCuda additions Copyright 2014 Christian Buchner, Christian H.\n"); + printf("\t LTC donation address: LKS1WDKGED647msBQfLBHV3Ls8sveGncnm\n"); + printf("\t BTC donation address: 16hJF5mceSojnTD3ZTUDqdRhDyPJzoRakM\n"); + printf("\t YAC donation address: Y87sptDEcpLkLeAuex6qZioDbvy1qXZEj4\n"); + + rpc_user = strdup(""); + rpc_pass = strdup(""); + + /* parse command line */ + parse_cmdline(argc, argv); + + if (!opt_benchmark && !rpc_url) { + fprintf(stderr, "%s: no URL supplied\n", argv[0]); + show_usage_and_exit(1); + } + + if (!rpc_userpass) { + rpc_userpass = (char*)malloc(strlen(rpc_user) + strlen(rpc_pass) + 2); + if (!rpc_userpass) + return 1; + sprintf(rpc_userpass, "%s:%s", rpc_user, rpc_pass); + } + + pthread_mutex_init(&applog_lock, NULL); + pthread_mutex_init(&stats_lock, NULL); + pthread_mutex_init(&g_work_lock, NULL); + pthread_mutex_init(&stratum.sock_lock, NULL); + pthread_mutex_init(&stratum.work_lock, NULL); + + flags = !opt_benchmark && strncmp(rpc_url, "https:", 6) + ? (CURL_GLOBAL_ALL & ~CURL_GLOBAL_SSL) + : CURL_GLOBAL_ALL; + if (curl_global_init(flags)) { + applog(LOG_ERR, "CURL initialization failed"); + return 1; + } + +#ifndef WIN32 + if (opt_background) { + i = fork(); + if (i < 0) exit(1); + if (i > 0) exit(0); + i = setsid(); + if (i < 0) + applog(LOG_ERR, "setsid() failed (errno = %d)", errno); + i = chdir("/"); + if (i < 0) + applog(LOG_ERR, "chdir() failed (errno = %d)", errno); + signal(SIGHUP, signal_handler); + signal(SIGINT, signal_handler); + signal(SIGTERM, signal_handler); + } +#endif + + num_processors = cuda_num_devices(); + if (num_processors == 0) + { + applog(LOG_ERR, "No CUDA devices found! 
terminating."); + exit(1); + } + if (!opt_n_threads) + opt_n_threads = num_processors; + +#ifdef HAVE_SYSLOG_H + if (use_syslog) + openlog("cpuminer", LOG_PID, LOG_USER); +#endif + + work_restart = (struct work_restart *)calloc(opt_n_threads, sizeof(*work_restart)); + if (!work_restart) + return 1; + + thr_info = (struct thr_info *)calloc(opt_n_threads + 3, sizeof(*thr)); + if (!thr_info) + return 1; + + thr_hashrates = (double *) calloc(opt_n_threads, sizeof(double)); + if (!thr_hashrates) + return 1; + + /* init workio thread info */ + work_thr_id = opt_n_threads; + thr = &thr_info[work_thr_id]; + thr->id = work_thr_id; + thr->q = tq_new(); + if (!thr->q) + return 1; + + /* start work I/O thread */ + if (pthread_create(&thr->pth, NULL, workio_thread, thr)) { + applog(LOG_ERR, "workio thread create failed"); + return 1; + } + + if (want_longpoll && !have_stratum) { + /* init longpoll thread info */ + longpoll_thr_id = opt_n_threads + 1; + thr = &thr_info[longpoll_thr_id]; + thr->id = longpoll_thr_id; + thr->q = tq_new(); + if (!thr->q) + return 1; + + /* start longpoll thread */ + if (unlikely(pthread_create(&thr->pth, NULL, longpoll_thread, thr))) { + applog(LOG_ERR, "longpoll thread create failed"); + return 1; + } + } + if (want_stratum) { + /* init stratum thread info */ + stratum_thr_id = opt_n_threads + 2; + thr = &thr_info[stratum_thr_id]; + thr->id = stratum_thr_id; + thr->q = tq_new(); + if (!thr->q) + return 1; + + /* start stratum thread */ + if (unlikely(pthread_create(&thr->pth, NULL, stratum_thread, thr))) { + applog(LOG_ERR, "stratum thread create failed"); + return 1; + } + + if (have_stratum) + tq_push(thr_info[stratum_thr_id].q, strdup(rpc_url)); + } + + /* start mining threads */ + for (i = 0; i < opt_n_threads; i++) { + thr = &thr_info[i]; + + thr->id = i; + thr->q = tq_new(); + if (!thr->q) + return 1; + + if (unlikely(pthread_create(&thr->pth, NULL, miner_thread, thr))) { + applog(LOG_ERR, "thread %d create failed", i); + return 1; + } + } + + applog(LOG_INFO, "%d miner threads started, " + "using '%s' algorithm.", + opt_n_threads, + algo_names[opt_algo]); + + /* main loop - simply wait for workio thread to exit */ + pthread_join(thr_info[work_thr_id].pth, NULL); + + applog(LOG_INFO, "workio thread dead, exiting."); + + return 0; +} diff --git a/cpuminer-config.h b/cpuminer-config.h new file mode 100644 index 0000000..bdd09fe --- /dev/null +++ b/cpuminer-config.h @@ -0,0 +1,190 @@ +/* cpuminer-config.h.in. Generated from configure.ac by autoheader. */ + +/* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP + systems. This function is required for `alloca.c' support on those systems. + */ +#undef CRAY_STACKSEG_END + +/* Define to 1 if using `alloca.c'. */ +#undef C_ALLOCA + +/* Define to 1 if you have `alloca', as a function or macro. */ +#undef HAVE_ALLOCA + +/* Define to 1 if you have and it should be used (not on Ultrix). + */ +#undef HAVE_ALLOCA_H + +/* Define to 1 if you have the declaration of `be32dec', and to 0 if you + don't. */ +#undef HAVE_DECL_BE32DEC + +/* Define to 1 if you have the declaration of `be32enc', and to 0 if you + don't. */ +#undef HAVE_DECL_BE32ENC + +/* Define to 1 if you have the declaration of `le32dec', and to 0 if you + don't. */ +#undef HAVE_DECL_LE32DEC + +/* Define to 1 if you have the declaration of `le32enc', and to 0 if you + don't. */ +#undef HAVE_DECL_LE32ENC + +/* Define to 1 if you have the `getopt_long' function. */ +#define HAVE_GETOPT_LONG 1 + +/* Define to 1 if you have the header file. 
*/ +#undef HAVE_INTTYPES_H + +/* Define to 1 if you have a functional curl library. */ +#undef HAVE_LIBCURL + +/* Define to 1 if you have the header file. */ +#undef HAVE_MEMORY_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_STDINT_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_STDLIB_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_STRINGS_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_STRING_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_SYSLOG_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_SYS_ENDIAN_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_SYS_PARAM_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_SYS_STAT_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_SYS_SYSCTL_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_SYS_TYPES_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_UNISTD_H + +/* Defined if libcurl supports AsynchDNS */ +#undef LIBCURL_FEATURE_ASYNCHDNS + +/* Defined if libcurl supports IDN */ +#undef LIBCURL_FEATURE_IDN + +/* Defined if libcurl supports IPv6 */ +#undef LIBCURL_FEATURE_IPV6 + +/* Defined if libcurl supports KRB4 */ +#undef LIBCURL_FEATURE_KRB4 + +/* Defined if libcurl supports libz */ +#undef LIBCURL_FEATURE_LIBZ + +/* Defined if libcurl supports NTLM */ +#undef LIBCURL_FEATURE_NTLM + +/* Defined if libcurl supports SSL */ +#undef LIBCURL_FEATURE_SSL + +/* Defined if libcurl supports SSPI */ +#undef LIBCURL_FEATURE_SSPI + +/* Defined if libcurl supports DICT */ +#undef LIBCURL_PROTOCOL_DICT + +/* Defined if libcurl supports FILE */ +#undef LIBCURL_PROTOCOL_FILE + +/* Defined if libcurl supports FTP */ +#undef LIBCURL_PROTOCOL_FTP + +/* Defined if libcurl supports FTPS */ +#undef LIBCURL_PROTOCOL_FTPS + +/* Defined if libcurl supports HTTP */ +#undef LIBCURL_PROTOCOL_HTTP + +/* Defined if libcurl supports HTTPS */ +#undef LIBCURL_PROTOCOL_HTTPS + +/* Defined if libcurl supports IMAP */ +#undef LIBCURL_PROTOCOL_IMAP + +/* Defined if libcurl supports LDAP */ +#undef LIBCURL_PROTOCOL_LDAP + +/* Defined if libcurl supports POP3 */ +#undef LIBCURL_PROTOCOL_POP3 + +/* Defined if libcurl supports RTSP */ +#undef LIBCURL_PROTOCOL_RTSP + +/* Defined if libcurl supports SMTP */ +#undef LIBCURL_PROTOCOL_SMTP + +/* Defined if libcurl supports TELNET */ +#undef LIBCURL_PROTOCOL_TELNET + +/* Defined if libcurl supports TFTP */ +#undef LIBCURL_PROTOCOL_TFTP + +/* Define to 1 if your C compiler doesn't accept -c and -o together. */ +#undef NO_MINUS_C_MINUS_O + +/* Name of package */ +#undef PACKAGE + +/* Define to the address where bug reports for this package should be sent. */ +#undef PACKAGE_BUGREPORT + +/* Define to the full name of this package. */ +#define PACKAGE_NAME "ccminer" + +/* Define to the full name and version of this package. */ +#define PACKAGE_STRING "ccminer 2014.03.18" + +/* Define to the one symbol short name of this package. */ +#undef PACKAGE_TARNAME + +/* Define to the home page for this package. */ +#undef PACKAGE_URL + +/* Define to the version of this package. */ +#define PACKAGE_VERSION "2014.03.18" + +/* If using the C implementation of alloca, define if you know the + direction of stack growth for your system; otherwise it will be + automatically deduced at runtime. 
+ STACK_DIRECTION > 0 => grows toward higher addresses + STACK_DIRECTION < 0 => grows toward lower addresses + STACK_DIRECTION = 0 => direction of growth unknown */ +#undef STACK_DIRECTION + +/* Define to 1 if you have the ANSI C header files. */ +#undef STDC_HEADERS + +/* Define to 1 if AVX assembly is available. */ +#undef USE_AVX + +/* Define to 1 if XOP assembly is available. */ +#undef USE_XOP + +/* Version number of package */ +#undef VERSION + +/* Define curl_free() as free() if our version of curl lacks curl_free. */ +#undef curl_free + +/* Define to `unsigned int' if does not define. */ +#undef size_t diff --git a/cpuminer-config.h.in b/cpuminer-config.h.in new file mode 100644 index 0000000..b8668be --- /dev/null +++ b/cpuminer-config.h.in @@ -0,0 +1,199 @@ +/* cpuminer-config.h.in. Generated from configure.ac by autoheader. */ + +/* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP + systems. This function is required for `alloca.c' support on those systems. + */ +#undef CRAY_STACKSEG_END + +/* Define to 1 if using `alloca.c'. */ +#undef C_ALLOCA + +/* Define to 1 if you have `alloca', as a function or macro. */ +#undef HAVE_ALLOCA + +/* Define to 1 if you have and it should be used (not on Ultrix). + */ +#undef HAVE_ALLOCA_H + +/* Define to 1 if you have the declaration of `be32dec', and to 0 if you + don't. */ +#undef HAVE_DECL_BE32DEC + +/* Define to 1 if you have the declaration of `be32enc', and to 0 if you + don't. */ +#undef HAVE_DECL_BE32ENC + +/* Define to 1 if you have the declaration of `le32dec', and to 0 if you + don't. */ +#undef HAVE_DECL_LE32DEC + +/* Define to 1 if you have the declaration of `le32enc', and to 0 if you + don't. */ +#undef HAVE_DECL_LE32ENC + +/* Define to 1 if you have the `getopt_long' function. */ +#undef HAVE_GETOPT_LONG + +/* Define to 1 if you have the header file. */ +#undef HAVE_INTTYPES_H + +/* Define to 1 if you have the `crypto' library (-lcrypto). */ +#undef HAVE_LIBCRYPTO + +/* Define to 1 if you have a functional curl library. */ +#undef HAVE_LIBCURL + +/* Define to 1 if you have the `ssl' library (-lssl). */ +#undef HAVE_LIBSSL + +/* Define to 1 if you have the header file. */ +#undef HAVE_MEMORY_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_STDINT_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_STDLIB_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_STRINGS_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_STRING_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_SYSLOG_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_SYS_ENDIAN_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_SYS_PARAM_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_SYS_STAT_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_SYS_SYSCTL_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_SYS_TYPES_H + +/* Define to 1 if you have the header file. 
*/ +#undef HAVE_UNISTD_H + +/* Defined if libcurl supports AsynchDNS */ +#undef LIBCURL_FEATURE_ASYNCHDNS + +/* Defined if libcurl supports IDN */ +#undef LIBCURL_FEATURE_IDN + +/* Defined if libcurl supports IPv6 */ +#undef LIBCURL_FEATURE_IPV6 + +/* Defined if libcurl supports KRB4 */ +#undef LIBCURL_FEATURE_KRB4 + +/* Defined if libcurl supports libz */ +#undef LIBCURL_FEATURE_LIBZ + +/* Defined if libcurl supports NTLM */ +#undef LIBCURL_FEATURE_NTLM + +/* Defined if libcurl supports SSL */ +#undef LIBCURL_FEATURE_SSL + +/* Defined if libcurl supports SSPI */ +#undef LIBCURL_FEATURE_SSPI + +/* Defined if libcurl supports DICT */ +#undef LIBCURL_PROTOCOL_DICT + +/* Defined if libcurl supports FILE */ +#undef LIBCURL_PROTOCOL_FILE + +/* Defined if libcurl supports FTP */ +#undef LIBCURL_PROTOCOL_FTP + +/* Defined if libcurl supports FTPS */ +#undef LIBCURL_PROTOCOL_FTPS + +/* Defined if libcurl supports HTTP */ +#undef LIBCURL_PROTOCOL_HTTP + +/* Defined if libcurl supports HTTPS */ +#undef LIBCURL_PROTOCOL_HTTPS + +/* Defined if libcurl supports IMAP */ +#undef LIBCURL_PROTOCOL_IMAP + +/* Defined if libcurl supports LDAP */ +#undef LIBCURL_PROTOCOL_LDAP + +/* Defined if libcurl supports POP3 */ +#undef LIBCURL_PROTOCOL_POP3 + +/* Defined if libcurl supports RTSP */ +#undef LIBCURL_PROTOCOL_RTSP + +/* Defined if libcurl supports SMTP */ +#undef LIBCURL_PROTOCOL_SMTP + +/* Defined if libcurl supports TELNET */ +#undef LIBCURL_PROTOCOL_TELNET + +/* Defined if libcurl supports TFTP */ +#undef LIBCURL_PROTOCOL_TFTP + +/* Define to 1 if your C compiler doesn't accept -c and -o together. */ +#undef NO_MINUS_C_MINUS_O + +/* Name of package */ +#undef PACKAGE + +/* Define to the address where bug reports for this package should be sent. */ +#undef PACKAGE_BUGREPORT + +/* Define to the full name of this package. */ +#undef PACKAGE_NAME + +/* Define to the full name and version of this package. */ +#undef PACKAGE_STRING + +/* Define to the one symbol short name of this package. */ +#undef PACKAGE_TARNAME + +/* Define to the home page for this package. */ +#undef PACKAGE_URL + +/* Define to the version of this package. */ +#undef PACKAGE_VERSION + +/* If using the C implementation of alloca, define if you know the + direction of stack growth for your system; otherwise it will be + automatically deduced at runtime. + STACK_DIRECTION > 0 => grows toward higher addresses + STACK_DIRECTION < 0 => grows toward lower addresses + STACK_DIRECTION = 0 => direction of growth unknown */ +#undef STACK_DIRECTION + +/* Define to 1 if you have the ANSI C header files. */ +#undef STDC_HEADERS + +/* Define to 1 if AVX assembly is available. */ +#undef USE_AVX + +/* Define to 1 if AVX2 assembly is available. */ +#undef USE_AVX2 + +/* Define to 1 if XOP assembly is available. */ +#undef USE_XOP + +/* Version number of package */ +#undef VERSION + +/* Define curl_free() as free() if our version of curl lacks curl_free. */ +#undef curl_free + +/* Define to `unsigned int' if does not define. 
 */
+#undef size_t
diff --git a/cuda_blake512.cu b/cuda_blake512.cu
new file mode 100644
index 0000000..325901d
--- /dev/null
+++ b/cuda_blake512.cu
@@ -0,0 +1,308 @@
+/* This function is designed for 84+32 byte input data (Heavycoin) */
+#include
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+
+#include
+#include
+
+// replace the following definitions with a header later
+typedef unsigned char uint8_t;
+typedef unsigned int uint32_t;
+typedef unsigned long long uint64_t;
+
+// global memory for the Hefty hashes of all threads
+extern uint32_t *d_heftyHashes[8];
+extern uint32_t *d_nonceVector[8];
+
+// global memory for our results
+uint32_t *d_hash5output[8];
+
+// the message (116 bytes) with padding, to be hashed on the GPU
+__constant__ uint64_t c_PaddedMessage[16]; // padded message (84+32 bytes + padding)
+
+// ---------------------------- BEGIN CUDA blake512 functions ------------------------------------
+
+__constant__ uint8_t c_sigma[16][16];
+
+const uint8_t host_sigma[16][16] =
+{
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
+ {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
+ { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
+ { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
+ { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
+ {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
+ {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
+ { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
+ {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
+ {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
+ { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
+ { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
+ { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
+};
+
+#define SWAP32(x) \
+ ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \
+ (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
+
+#define SWAP64(x) \
+ ((uint64_t)((((uint64_t)(x) & 0xff00000000000000ULL) >> 56) | \
+ (((uint64_t)(x) & 0x00ff000000000000ULL) >> 40) | \
+ (((uint64_t)(x) & 0x0000ff0000000000ULL) >> 24) | \
+ (((uint64_t)(x) & 0x000000ff00000000ULL) >> 8) | \
+ (((uint64_t)(x) & 0x00000000ff000000ULL) << 8) | \
+ (((uint64_t)(x) & 0x0000000000ff0000ULL) << 24) | \
+ (((uint64_t)(x) & 0x000000000000ff00ULL) << 40) | \
+ (((uint64_t)(x) & 0x00000000000000ffULL) << 56)))
+
+__constant__ uint64_t c_SecondRound[16];
+
+const uint64_t host_SecondRound[16] =
+{
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,SWAP64(1),0,SWAP64(0x3A0)
+};
+
+__constant__ uint64_t c_u512[16];
+
+const uint64_t host_u512[16] =
+{
+ 0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL,
+ 0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL,
+ 0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL,
+ 0xc0ac29b7c97c50ddULL, 0x3f84d5b5b5470917ULL,
+ 0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL,
+ 0x2ffd72dbd01adfb7ULL, 0xb8e1afed6a267e96ULL,
+ 0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL,
+ 0x0801f2e2858efc16ULL, 0x636920d871574e69ULL
+};
+
+
+#define ROTR(x,n) (((x)<<(64-n))|( (x)>>(n)))
+
+#define G(a,b,c,d,e) \
+ v[a] += (m[sigma[i][e]] ^ u512[sigma[i][e+1]]) + v[b];\
+ v[d] = ROTR( v[d] ^ v[a],32); \
+ v[c] += v[d]; \
+ v[b] = ROTR( v[b] ^ v[c],25); \
+ v[a] += (m[sigma[i][e+1]] ^ u512[sigma[i][e]])+v[b]; \
+ v[d] = ROTR( v[d] ^ v[a],16); \
+ v[c] += v[d]; \
+ v[b] = ROTR( v[b] ^ v[c],11);
+
+__device__ void blake512_compress( uint64_t *h, const uint64_t *block, int nullt, const uint8_t ((*sigma)[16]), const uint64_t *u512 )
+{
+ uint64_t v[16], m[16], i;
+
+#pragma unroll 16
+ for( i = 0; i < 16; ++i ) m[i] = SWAP64(block[i]);
+
+#pragma unroll 8
+ for( i = 0; i < 8; ++i ) v[i] = h[i];
+
+ v[ 8] = u512[0];
+ v[ 9] = u512[1];
+ v[10] = u512[2];
+ v[11] = u512[3];
+ v[12] = u512[4];
+ v[13] = u512[5];
+ v[14] = u512[6];
+ v[15] = u512[7];
+
+ /* don't xor t when the block is only padding */
+ if ( !nullt ) {
+ v[12] ^= 928;
+ v[13] ^= 928;
+ }
+
+#pragma unroll 16
+ for( i = 0; i < 16; ++i )
+ {
+ /* column step */
+ G( 0, 4, 8, 12, 0 );
+ G( 1, 5, 9, 13, 2 );
+ G( 2, 6, 10, 14, 4 );
+ G( 3, 7, 11, 15, 6 );
+ /* diagonal step */
+ G( 0, 5, 10, 15, 8 );
+ G( 1, 6, 11, 12, 10 );
+ G( 2, 7, 8, 13, 12 );
+ G( 3, 4, 9, 14, 14 );
+ }
+
+#pragma unroll 16
+ for( i = 0; i < 16; ++i ) h[i % 8] ^= v[i];
+}
+
+// endian swap for 32 bit types
+static __device__ uint32_t cuda_swab32(uint32_t x)
+{
+ return (((x << 24) & 0xff000000u) | ((x << 8) & 0x00ff0000u)
+ | ((x >> 8) & 0x0000ff00u) | ((x >> 24) & 0x000000ffu));
+}
+
+// endian swap for 64 bit types
+static __device__ uint64_t cuda_swab64(uint64_t x) {
+ uint32_t h = (x >> 32);
+ uint32_t l = (x & 0xFFFFFFFFULL);
+ return (((uint64_t)cuda_swab32(l)) << 32) | ((uint64_t)cuda_swab32(h));
+}
+
+// extract the hi word from a 64 bit type
+static __device__ uint32_t HIWORD(const uint64_t &x) {
+#if __CUDA_ARCH__ >= 130
+ return (uint32_t)__double2hiint(__longlong_as_double(x));
+#else
+ return (uint32_t)(x >> 32);
+#endif
+}
+
+// replace the hi word in a 64 bit type
+static __device__ uint64_t REPLACE_HIWORD(const uint64_t &x, const uint32_t &y) {
+ return (x & 0xFFFFFFFFULL) | (((uint64_t)y) << 32ULL);
+}
+
+// extract the lo word from a 64 bit type
+static __device__ uint32_t LOWORD(const uint64_t &x) {
+#if __CUDA_ARCH__ >= 130
+ return (uint32_t)__double2loint(__longlong_as_double(x));
+#else
+ return (uint32_t)(x & 0xFFFFFFFFULL);
+#endif
+}
+
+// replace the lo word in a 64 bit type
+static __device__ uint64_t REPLACE_LOWORD(const uint64_t &x, const uint32_t &y) {
+ return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y);
+}
+
+__global__ void blake512_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector)
+{
+ int thread = (blockDim.x * blockIdx.x + threadIdx.x);
+ if (thread < threads)
+ {
+ // determine the current counter (nonce)
+ //uint32_t nounce = startNounce + thread;
+ uint32_t nounce = nonceVector[thread];
+
+ // determine the index of this hash in the hash buffers (Hefty1 and outputHash)
+ uint32_t hashPosition = nounce - startNounce;
+
+ // prepare the state
+ uint64_t h[8];
+ h[0] = 0x6a09e667f3bcc908ULL;
+ h[1] = 0xbb67ae8584caa73bULL;
+ h[2] = 0x3c6ef372fe94f82bULL;
+ h[3] = 0xa54ff53a5f1d36f1ULL;
+ h[4] = 0x510e527fade682d1ULL;
+ h[5] = 0x9b05688c2b3e6c1fULL;
+ h[6] = 0x1f83d9abfb41bd6bULL;
+ h[7] = 0x5be0cd19137e2179ULL;
+
+ // 128 bytes for the message
+ uint64_t buf[16];
+
+ // fetch the message for the first round into registers
+#pragma unroll 16
+ for (int i=0; i < 16; ++i) buf[i] = c_PaddedMessage[i];
+
+ // replace the nonce with the thread-specific one
+ buf[9] = REPLACE_HIWORD(buf[9], nounce);
+
+ // insert the thread-specific Hefty1 hash
+ uint32_t *hefty = heftyHashes + 8 * hashPosition;
+ buf[10] = REPLACE_HIWORD(buf[10], hefty[0]);
+ buf[11] = REPLACE_LOWORD(buf[11], hefty[1]);
+ buf[11] = REPLACE_HIWORD(buf[11], hefty[2]);
+ buf[12] = REPLACE_LOWORD(buf[12], hefty[3]);
+ buf[12] = REPLACE_HIWORD(buf[12], hefty[4]);
+ buf[13] = REPLACE_LOWORD(buf[13], hefty[5]);
+ buf[13] = REPLACE_HIWORD(buf[13], hefty[6]);
+ buf[14] = REPLACE_LOWORD(buf[14], hefty[7]);
+
+ // first round
+ blake512_compress( h, buf, 0, c_sigma, c_u512 );
+
+ // second round
+#pragma unroll 16
+ for (int i=0; i < 16; ++i) buf[i] = c_SecondRound[i];
+ blake512_compress( h, buf, 1, c_sigma, c_u512 );
+
+ // emit the hash
+#if 0
+ // 32 bit operations only, provided the SM1.3 double intrinsics are available
+ uint32_t *outHash = (uint32_t *)outputHash + 16 * hashPosition;
+#pragma unroll 8
+ for (int i=0; i < 8; ++i) {
+ outHash[2*i+0] = cuda_swab32( HIWORD(h[i]) );
+ outHash[2*i+1] = cuda_swab32( LOWORD(h[i]) );
+ }
+#else
+ // this variant also performs a few 64 bit shifts
+ uint64_t *outHash = (uint64_t *)outputHash + 8 * hashPosition;
+#pragma unroll 8
+ for (int i=0; i < 8; ++i) outHash[i] = cuda_swab64( h[i] );
+#endif
+ }
+}
+
+
+// ---------------------------- END CUDA blake512 functions ------------------------------------
+
+// setup functions
+__host__ void blake512_cpu_init(int thr_id, int threads)
+{
+ // copy the hash tables into GPU memory
+ cudaMemcpyToSymbol( c_sigma,
+ host_sigma,
+ sizeof(host_sigma),
+ 0, cudaMemcpyHostToDevice);
+
+ cudaMemcpyToSymbol( c_u512,
+ host_u512,
+ sizeof(host_u512),
+ 0, cudaMemcpyHostToDevice);
+
+ cudaMemcpyToSymbol( c_SecondRound,
+ host_SecondRound,
+ sizeof(host_SecondRound),
+ 0, cudaMemcpyHostToDevice);
+
+ // allocate memory for all results
+ cudaMalloc(&d_hash5output[thr_id], 16 * sizeof(uint32_t) * threads);
+}
+
+__host__ void blake512_cpu_setBlock(void *pdata)
+ // data must be 84 bytes!
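+ // (occupies bytes 0..83 of the padded message; the nonce at byte offset 76
+ // is overwritten per thread in blake512_gpu_hash via REPLACE_HIWORD(buf[9], nounce))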
+ // heftyHash has 32 bytes
+{
+ // prepare the message plus padding for the first round
+ unsigned char PaddedMessage[128];
+ memcpy(PaddedMessage, pdata, 84);
+ memset(PaddedMessage+84, 0, 32); // fill in an empty Hefty hash
+ memset(PaddedMessage+116, 0, 12);
+ PaddedMessage[116] = 0x80;
+
+ // the message (116 bytes) without padding, to be hashed on the GPU
+ cudaMemcpyToSymbol( c_PaddedMessage, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
+}
+
+
+__host__ void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
+{
+ const int threadsperblock = 128;
+
+ // compute how many thread blocks we need
+ dim3 grid((threads + threadsperblock-1)/threadsperblock);
+ dim3 block(threadsperblock);
+
+ // size of the dynamic shared memory region (depends on the thread count)
+ size_t shared_size = 0;
+
+// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
+
+ blake512_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_hash5output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
+}
diff --git a/cuda_blake512.h b/cuda_blake512.h
new file mode 100644
index 0000000..b0cf201
--- /dev/null
+++ b/cuda_blake512.h
@@ -0,0 +1,8 @@
+#ifndef _CUDA_BLAKE512_H
+#define _CUDA_BLAKE512_H
+
+void blake512_cpu_init(int thr_id, int threads);
+void blake512_cpu_setBlock(void *pdata);
+void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce);
+
+#endif
diff --git a/cuda_combine.cu b/cuda_combine.cu
new file mode 100644
index 0000000..2949765
--- /dev/null
+++ b/cuda_combine.cu
@@ -0,0 +1,151 @@
+/* This function is designed for 84+32 byte input data (Heavycoin) */
+#include
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+
+// replace the following definitions with a header later
+typedef unsigned int uint32_t;
+
+// global memory for our results
+uint32_t *d_hashoutput[8];
+
+extern uint32_t *d_hash2output[8];
+extern uint32_t *d_hash3output[8];
+extern uint32_t *d_hash4output[8];
+extern uint32_t *d_hash5output[8];
+extern uint32_t *d_nonceVector[8];
+
+/* Combines top 64-bits from each hash into a single hash */
+static void __device__ combine_hashes(uint32_t *out, uint32_t *hash1, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4)
+{
+ uint32_t lout[8]; // do the combining in registers
+
+#pragma unroll 8
+ for (int i=0; i < 8; ++i)
+ lout[i] = 0;
+
+ // the macro packs 4 bits taken from four different hashes into one nibble
+#define MIX(bits, mask, i) \
+ lout[(255 - (bits+3))/32] <<= 4; \
+ if ((hash1[i] & mask) != 0) lout[(255 - (bits+0))/32] |= 8; \
+ if ((hash2[i] & mask) != 0) lout[(255 - (bits+1))/32] |= 4; \
+ if ((hash3[i] & mask) != 0) lout[(255 - (bits+2))/32] |= 2; \
+ if ((hash4[i] & mask) != 0) lout[(255 - (bits+3))/32] |= 1; \
+
+ /* Transpose first 64 bits of each hash into out */
+ MIX( 0, 0x80000000, 7);
+ MIX( 4, 0x40000000, 7);
+ MIX( 8, 0x20000000, 7);
+ MIX( 12, 0x10000000, 7);
+ MIX( 16, 0x08000000, 7);
+ MIX( 20, 0x04000000, 7);
+ MIX( 24, 0x02000000, 7);
+ MIX( 28, 0x01000000, 7);
+ MIX( 32, 0x00800000, 7);
+ MIX( 36, 0x00400000, 7);
+ MIX( 40, 0x00200000, 7);
+ MIX( 44, 0x00100000, 7);
+ MIX( 48, 0x00080000, 7);
+ MIX( 52, 0x00040000, 7);
+ MIX( 56, 0x00020000, 7);
+ MIX( 60, 0x00010000, 7);
+ MIX( 64, 0x00008000, 7);
+ MIX( 68, 0x00004000, 7);
+ MIX( 72, 0x00002000, 7);
+ MIX( 76, 0x00001000, 7);
+ MIX( 80, 0x00000800, 7);
+ MIX( 84, 0x00000400, 7);
+ MIX( 88, 0x00000200, 7);
+ MIX( 92, 0x00000100, 7);
+ MIX( 96, 0x00000080, 7);
+ MIX(100, 0x00000040, 7);
+ MIX(104, 0x00000020, 7);
+ MIX(108, 0x00000010, 7);
+ MIX(112, 0x00000008, 7);
+ MIX(116, 0x00000004, 7);
+ MIX(120, 0x00000002, 7);
+ MIX(124, 0x00000001, 7);
+
+ MIX(128, 0x80000000, 6);
+ MIX(132, 0x40000000, 6);
+ MIX(136, 0x20000000, 6);
+ MIX(140, 0x10000000, 6);
+ MIX(144, 0x08000000, 6);
+ MIX(148, 0x04000000, 6);
+ MIX(152, 0x02000000, 6);
+ MIX(156, 0x01000000, 6);
+ MIX(160, 0x00800000, 6);
+ MIX(164, 0x00400000, 6);
+ MIX(168, 0x00200000, 6);
+ MIX(172, 0x00100000, 6);
+ MIX(176, 0x00080000, 6);
+ MIX(180, 0x00040000, 6);
+ MIX(184, 0x00020000, 6);
+ MIX(188, 0x00010000, 6);
+ MIX(192, 0x00008000, 6);
+ MIX(196, 0x00004000, 6);
+ MIX(200, 0x00002000, 6);
+ MIX(204, 0x00001000, 6);
+ MIX(208, 0x00000800, 6);
+ MIX(212, 0x00000400, 6);
+ MIX(216, 0x00000200, 6);
+ MIX(220, 0x00000100, 6);
+ MIX(224, 0x00000080, 6);
+ MIX(228, 0x00000040, 6);
+ MIX(232, 0x00000020, 6);
+ MIX(236, 0x00000010, 6);
+ MIX(240, 0x00000008, 6);
+ MIX(244, 0x00000004, 6);
+ MIX(248, 0x00000002, 6);
+ MIX(252, 0x00000001, 6);
+
+#pragma unroll 8
+ for (int i=0; i < 8; ++i)
+ out[i] = lout[i];
+}
+
+__global__ void combine_gpu_hash(int threads, uint32_t startNounce, uint32_t *out, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4, uint32_t *hash5, uint32_t *nonceVector)
+{
+ int thread = (blockDim.x * blockIdx.x + threadIdx.x);
+ if (thread < threads)
+ {
+ uint32_t nounce = nonceVector[thread];
+ uint32_t hashPosition = nounce - startNounce;
+ // The combine function's job has two parts:
+ // 1) compress the hashes into a small array
+ // 2) compute the combined value there
+
+ // The compression comes from the fact that the out array is still indexed
+ // by "thread", while the other values are indexed by the nonce
+
+ combine_hashes(&out[8 * thread], &hash2[8 * hashPosition], &hash3[16 * hashPosition], &hash4[16 * hashPosition], &hash5[16 * hashPosition]);
+ }
+}
+
+// setup functions
+__host__ void combine_cpu_init(int thr_id, int threads)
+{
+ // allocate memory for all results
+ cudaMalloc(&d_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads);
+}
+
+void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *hash)
+{
+ // these copies are optional, since the hashes should already reside on the GPU at this point
+
+ const int threadsperblock = 128;
+
+ // compute how many thread blocks we need
+ dim3 grid((threads + threadsperblock-1)/threadsperblock);
+ dim3 block(threadsperblock);
+
+ // size of the dynamic shared memory region (depends on the thread count)
+ size_t shared_size = 0;
+
+// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
+
+ combine_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_hashoutput[thr_id], d_hash2output[thr_id], d_hash3output[thr_id], d_hash4output[thr_id], d_hash5output[thr_id], d_nonceVector[thr_id]);
+
+ // since hash evaluation still happens on the CPU, the results have to be copied back to the host in any case
+ cudaMemcpy(hash, d_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads, cudaMemcpyDeviceToHost);
+}
diff --git a/cuda_combine.h b/cuda_combine.h
new file mode 100644
index 0000000..ada3a21
--- /dev/null
+++ b/cuda_combine.h
@@ -0,0 +1,7 @@
+#ifndef _CUDA_COMBINE_H
+#define _CUDA_COMBINE_H
+
+void combine_cpu_init(int thr_id, int threads);
+void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *hash);
+
+#endif
diff --git a/cuda_fugue256.cu b/cuda_fugue256.cu new
file mode 100644 index 0000000..0457130 --- /dev/null +++ b/cuda_fugue256.cu @@ -0,0 +1,782 @@ +#if 1 +/* Diese Funktion ist auf 84+32 Byte große Eingabedaten ausgerichtet (Heavycoin) */ +#include +#include "cuda_runtime.h" +#include "device_launch_parameters.h" + +#include +#include + +#include "sph_fugue.h" + +// heavy.cu +extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); + +// Folgende Definitionen später durch header ersetzen +typedef unsigned char uint8_t; +typedef unsigned int uint32_t; +typedef unsigned long long uint64_t; + +// schon in sph_fugue.h definiert +//#define SPH_C32(x) ((uint32_t)(x ## U)) + +uint32_t *d_fugue256_hashoutput[8]; +uint32_t *d_resultNonce[8]; + +__constant__ uint32_t GPUstate[30]; // Single GPU +__constant__ uint32_t pTarget[8]; // Single GPU + +//__constant__ uint32_t mixtab0[256]; // 1K +//__constant__ uint32_t mixtab1[256]; // 1K +//__constant__ uint32_t mixtab2[256]; // 1K +//__constant__ uint32_t mixtab3[256]; // 1K + +texture mixTab0Tex; +texture mixTab1Tex; +texture mixTab2Tex; +texture mixTab3Tex; + +#define mixtab0(x) tex1Dfetch(mixTab0Tex, x) +#define mixtab1(x) tex1Dfetch(mixTab1Tex, x) +#define mixtab2(x) tex1Dfetch(mixTab2Tex, x) +#define mixtab3(x) tex1Dfetch(mixTab3Tex, x) + +/* TABELLEN */ +static const uint32_t mixtab0_cpu[] = { + SPH_C32(0x63633297), SPH_C32(0x7c7c6feb), SPH_C32(0x77775ec7), + SPH_C32(0x7b7b7af7), SPH_C32(0xf2f2e8e5), SPH_C32(0x6b6b0ab7), + SPH_C32(0x6f6f16a7), SPH_C32(0xc5c56d39), SPH_C32(0x303090c0), + SPH_C32(0x01010704), SPH_C32(0x67672e87), SPH_C32(0x2b2bd1ac), + SPH_C32(0xfefeccd5), SPH_C32(0xd7d71371), SPH_C32(0xabab7c9a), + SPH_C32(0x767659c3), SPH_C32(0xcaca4005), SPH_C32(0x8282a33e), + SPH_C32(0xc9c94909), SPH_C32(0x7d7d68ef), SPH_C32(0xfafad0c5), + SPH_C32(0x5959947f), SPH_C32(0x4747ce07), SPH_C32(0xf0f0e6ed), + SPH_C32(0xadad6e82), SPH_C32(0xd4d41a7d), SPH_C32(0xa2a243be), + SPH_C32(0xafaf608a), SPH_C32(0x9c9cf946), SPH_C32(0xa4a451a6), + SPH_C32(0x727245d3), SPH_C32(0xc0c0762d), SPH_C32(0xb7b728ea), + SPH_C32(0xfdfdc5d9), SPH_C32(0x9393d47a), SPH_C32(0x2626f298), + SPH_C32(0x363682d8), SPH_C32(0x3f3fbdfc), SPH_C32(0xf7f7f3f1), + SPH_C32(0xcccc521d), SPH_C32(0x34348cd0), SPH_C32(0xa5a556a2), + SPH_C32(0xe5e58db9), SPH_C32(0xf1f1e1e9), SPH_C32(0x71714cdf), + SPH_C32(0xd8d83e4d), SPH_C32(0x313197c4), SPH_C32(0x15156b54), + SPH_C32(0x04041c10), SPH_C32(0xc7c76331), SPH_C32(0x2323e98c), + SPH_C32(0xc3c37f21), SPH_C32(0x18184860), SPH_C32(0x9696cf6e), + SPH_C32(0x05051b14), SPH_C32(0x9a9aeb5e), SPH_C32(0x0707151c), + SPH_C32(0x12127e48), SPH_C32(0x8080ad36), SPH_C32(0xe2e298a5), + SPH_C32(0xebeba781), SPH_C32(0x2727f59c), SPH_C32(0xb2b233fe), + SPH_C32(0x757550cf), SPH_C32(0x09093f24), SPH_C32(0x8383a43a), + SPH_C32(0x2c2cc4b0), SPH_C32(0x1a1a4668), SPH_C32(0x1b1b416c), + SPH_C32(0x6e6e11a3), SPH_C32(0x5a5a9d73), SPH_C32(0xa0a04db6), + SPH_C32(0x5252a553), SPH_C32(0x3b3ba1ec), SPH_C32(0xd6d61475), + SPH_C32(0xb3b334fa), SPH_C32(0x2929dfa4), SPH_C32(0xe3e39fa1), + SPH_C32(0x2f2fcdbc), SPH_C32(0x8484b126), SPH_C32(0x5353a257), + SPH_C32(0xd1d10169), SPH_C32(0x00000000), SPH_C32(0xededb599), + SPH_C32(0x2020e080), SPH_C32(0xfcfcc2dd), SPH_C32(0xb1b13af2), + SPH_C32(0x5b5b9a77), SPH_C32(0x6a6a0db3), SPH_C32(0xcbcb4701), + SPH_C32(0xbebe17ce), SPH_C32(0x3939afe4), SPH_C32(0x4a4aed33), + SPH_C32(0x4c4cff2b), SPH_C32(0x5858937b), SPH_C32(0xcfcf5b11), + SPH_C32(0xd0d0066d), SPH_C32(0xefefbb91), SPH_C32(0xaaaa7b9e), + SPH_C32(0xfbfbd7c1), SPH_C32(0x4343d217), SPH_C32(0x4d4df82f), + 
SPH_C32(0x333399cc), SPH_C32(0x8585b622), SPH_C32(0x4545c00f), + SPH_C32(0xf9f9d9c9), SPH_C32(0x02020e08), SPH_C32(0x7f7f66e7), + SPH_C32(0x5050ab5b), SPH_C32(0x3c3cb4f0), SPH_C32(0x9f9ff04a), + SPH_C32(0xa8a87596), SPH_C32(0x5151ac5f), SPH_C32(0xa3a344ba), + SPH_C32(0x4040db1b), SPH_C32(0x8f8f800a), SPH_C32(0x9292d37e), + SPH_C32(0x9d9dfe42), SPH_C32(0x3838a8e0), SPH_C32(0xf5f5fdf9), + SPH_C32(0xbcbc19c6), SPH_C32(0xb6b62fee), SPH_C32(0xdada3045), + SPH_C32(0x2121e784), SPH_C32(0x10107040), SPH_C32(0xffffcbd1), + SPH_C32(0xf3f3efe1), SPH_C32(0xd2d20865), SPH_C32(0xcdcd5519), + SPH_C32(0x0c0c2430), SPH_C32(0x1313794c), SPH_C32(0xececb29d), + SPH_C32(0x5f5f8667), SPH_C32(0x9797c86a), SPH_C32(0x4444c70b), + SPH_C32(0x1717655c), SPH_C32(0xc4c46a3d), SPH_C32(0xa7a758aa), + SPH_C32(0x7e7e61e3), SPH_C32(0x3d3db3f4), SPH_C32(0x6464278b), + SPH_C32(0x5d5d886f), SPH_C32(0x19194f64), SPH_C32(0x737342d7), + SPH_C32(0x60603b9b), SPH_C32(0x8181aa32), SPH_C32(0x4f4ff627), + SPH_C32(0xdcdc225d), SPH_C32(0x2222ee88), SPH_C32(0x2a2ad6a8), + SPH_C32(0x9090dd76), SPH_C32(0x88889516), SPH_C32(0x4646c903), + SPH_C32(0xeeeebc95), SPH_C32(0xb8b805d6), SPH_C32(0x14146c50), + SPH_C32(0xdede2c55), SPH_C32(0x5e5e8163), SPH_C32(0x0b0b312c), + SPH_C32(0xdbdb3741), SPH_C32(0xe0e096ad), SPH_C32(0x32329ec8), + SPH_C32(0x3a3aa6e8), SPH_C32(0x0a0a3628), SPH_C32(0x4949e43f), + SPH_C32(0x06061218), SPH_C32(0x2424fc90), SPH_C32(0x5c5c8f6b), + SPH_C32(0xc2c27825), SPH_C32(0xd3d30f61), SPH_C32(0xacac6986), + SPH_C32(0x62623593), SPH_C32(0x9191da72), SPH_C32(0x9595c662), + SPH_C32(0xe4e48abd), SPH_C32(0x797974ff), SPH_C32(0xe7e783b1), + SPH_C32(0xc8c84e0d), SPH_C32(0x373785dc), SPH_C32(0x6d6d18af), + SPH_C32(0x8d8d8e02), SPH_C32(0xd5d51d79), SPH_C32(0x4e4ef123), + SPH_C32(0xa9a97292), SPH_C32(0x6c6c1fab), SPH_C32(0x5656b943), + SPH_C32(0xf4f4fafd), SPH_C32(0xeaeaa085), SPH_C32(0x6565208f), + SPH_C32(0x7a7a7df3), SPH_C32(0xaeae678e), SPH_C32(0x08083820), + SPH_C32(0xbaba0bde), SPH_C32(0x787873fb), SPH_C32(0x2525fb94), + SPH_C32(0x2e2ecab8), SPH_C32(0x1c1c5470), SPH_C32(0xa6a65fae), + SPH_C32(0xb4b421e6), SPH_C32(0xc6c66435), SPH_C32(0xe8e8ae8d), + SPH_C32(0xdddd2559), SPH_C32(0x747457cb), SPH_C32(0x1f1f5d7c), + SPH_C32(0x4b4bea37), SPH_C32(0xbdbd1ec2), SPH_C32(0x8b8b9c1a), + SPH_C32(0x8a8a9b1e), SPH_C32(0x70704bdb), SPH_C32(0x3e3ebaf8), + SPH_C32(0xb5b526e2), SPH_C32(0x66662983), SPH_C32(0x4848e33b), + SPH_C32(0x0303090c), SPH_C32(0xf6f6f4f5), SPH_C32(0x0e0e2a38), + SPH_C32(0x61613c9f), SPH_C32(0x35358bd4), SPH_C32(0x5757be47), + SPH_C32(0xb9b902d2), SPH_C32(0x8686bf2e), SPH_C32(0xc1c17129), + SPH_C32(0x1d1d5374), SPH_C32(0x9e9ef74e), SPH_C32(0xe1e191a9), + SPH_C32(0xf8f8decd), SPH_C32(0x9898e556), SPH_C32(0x11117744), + SPH_C32(0x696904bf), SPH_C32(0xd9d93949), SPH_C32(0x8e8e870e), + SPH_C32(0x9494c166), SPH_C32(0x9b9bec5a), SPH_C32(0x1e1e5a78), + SPH_C32(0x8787b82a), SPH_C32(0xe9e9a989), SPH_C32(0xcece5c15), + SPH_C32(0x5555b04f), SPH_C32(0x2828d8a0), SPH_C32(0xdfdf2b51), + SPH_C32(0x8c8c8906), SPH_C32(0xa1a14ab2), SPH_C32(0x89899212), + SPH_C32(0x0d0d2334), SPH_C32(0xbfbf10ca), SPH_C32(0xe6e684b5), + SPH_C32(0x4242d513), SPH_C32(0x686803bb), SPH_C32(0x4141dc1f), + SPH_C32(0x9999e252), SPH_C32(0x2d2dc3b4), SPH_C32(0x0f0f2d3c), + SPH_C32(0xb0b03df6), SPH_C32(0x5454b74b), SPH_C32(0xbbbb0cda), + SPH_C32(0x16166258) +}; + +static const uint32_t mixtab1_cpu[] = { + SPH_C32(0x97636332), SPH_C32(0xeb7c7c6f), SPH_C32(0xc777775e), + SPH_C32(0xf77b7b7a), SPH_C32(0xe5f2f2e8), SPH_C32(0xb76b6b0a), + SPH_C32(0xa76f6f16), 
SPH_C32(0x39c5c56d), SPH_C32(0xc0303090), + SPH_C32(0x04010107), SPH_C32(0x8767672e), SPH_C32(0xac2b2bd1), + SPH_C32(0xd5fefecc), SPH_C32(0x71d7d713), SPH_C32(0x9aabab7c), + SPH_C32(0xc3767659), SPH_C32(0x05caca40), SPH_C32(0x3e8282a3), + SPH_C32(0x09c9c949), SPH_C32(0xef7d7d68), SPH_C32(0xc5fafad0), + SPH_C32(0x7f595994), SPH_C32(0x074747ce), SPH_C32(0xedf0f0e6), + SPH_C32(0x82adad6e), SPH_C32(0x7dd4d41a), SPH_C32(0xbea2a243), + SPH_C32(0x8aafaf60), SPH_C32(0x469c9cf9), SPH_C32(0xa6a4a451), + SPH_C32(0xd3727245), SPH_C32(0x2dc0c076), SPH_C32(0xeab7b728), + SPH_C32(0xd9fdfdc5), SPH_C32(0x7a9393d4), SPH_C32(0x982626f2), + SPH_C32(0xd8363682), SPH_C32(0xfc3f3fbd), SPH_C32(0xf1f7f7f3), + SPH_C32(0x1dcccc52), SPH_C32(0xd034348c), SPH_C32(0xa2a5a556), + SPH_C32(0xb9e5e58d), SPH_C32(0xe9f1f1e1), SPH_C32(0xdf71714c), + SPH_C32(0x4dd8d83e), SPH_C32(0xc4313197), SPH_C32(0x5415156b), + SPH_C32(0x1004041c), SPH_C32(0x31c7c763), SPH_C32(0x8c2323e9), + SPH_C32(0x21c3c37f), SPH_C32(0x60181848), SPH_C32(0x6e9696cf), + SPH_C32(0x1405051b), SPH_C32(0x5e9a9aeb), SPH_C32(0x1c070715), + SPH_C32(0x4812127e), SPH_C32(0x368080ad), SPH_C32(0xa5e2e298), + SPH_C32(0x81ebeba7), SPH_C32(0x9c2727f5), SPH_C32(0xfeb2b233), + SPH_C32(0xcf757550), SPH_C32(0x2409093f), SPH_C32(0x3a8383a4), + SPH_C32(0xb02c2cc4), SPH_C32(0x681a1a46), SPH_C32(0x6c1b1b41), + SPH_C32(0xa36e6e11), SPH_C32(0x735a5a9d), SPH_C32(0xb6a0a04d), + SPH_C32(0x535252a5), SPH_C32(0xec3b3ba1), SPH_C32(0x75d6d614), + SPH_C32(0xfab3b334), SPH_C32(0xa42929df), SPH_C32(0xa1e3e39f), + SPH_C32(0xbc2f2fcd), SPH_C32(0x268484b1), SPH_C32(0x575353a2), + SPH_C32(0x69d1d101), SPH_C32(0x00000000), SPH_C32(0x99ededb5), + SPH_C32(0x802020e0), SPH_C32(0xddfcfcc2), SPH_C32(0xf2b1b13a), + SPH_C32(0x775b5b9a), SPH_C32(0xb36a6a0d), SPH_C32(0x01cbcb47), + SPH_C32(0xcebebe17), SPH_C32(0xe43939af), SPH_C32(0x334a4aed), + SPH_C32(0x2b4c4cff), SPH_C32(0x7b585893), SPH_C32(0x11cfcf5b), + SPH_C32(0x6dd0d006), SPH_C32(0x91efefbb), SPH_C32(0x9eaaaa7b), + SPH_C32(0xc1fbfbd7), SPH_C32(0x174343d2), SPH_C32(0x2f4d4df8), + SPH_C32(0xcc333399), SPH_C32(0x228585b6), SPH_C32(0x0f4545c0), + SPH_C32(0xc9f9f9d9), SPH_C32(0x0802020e), SPH_C32(0xe77f7f66), + SPH_C32(0x5b5050ab), SPH_C32(0xf03c3cb4), SPH_C32(0x4a9f9ff0), + SPH_C32(0x96a8a875), SPH_C32(0x5f5151ac), SPH_C32(0xbaa3a344), + SPH_C32(0x1b4040db), SPH_C32(0x0a8f8f80), SPH_C32(0x7e9292d3), + SPH_C32(0x429d9dfe), SPH_C32(0xe03838a8), SPH_C32(0xf9f5f5fd), + SPH_C32(0xc6bcbc19), SPH_C32(0xeeb6b62f), SPH_C32(0x45dada30), + SPH_C32(0x842121e7), SPH_C32(0x40101070), SPH_C32(0xd1ffffcb), + SPH_C32(0xe1f3f3ef), SPH_C32(0x65d2d208), SPH_C32(0x19cdcd55), + SPH_C32(0x300c0c24), SPH_C32(0x4c131379), SPH_C32(0x9dececb2), + SPH_C32(0x675f5f86), SPH_C32(0x6a9797c8), SPH_C32(0x0b4444c7), + SPH_C32(0x5c171765), SPH_C32(0x3dc4c46a), SPH_C32(0xaaa7a758), + SPH_C32(0xe37e7e61), SPH_C32(0xf43d3db3), SPH_C32(0x8b646427), + SPH_C32(0x6f5d5d88), SPH_C32(0x6419194f), SPH_C32(0xd7737342), + SPH_C32(0x9b60603b), SPH_C32(0x328181aa), SPH_C32(0x274f4ff6), + SPH_C32(0x5ddcdc22), SPH_C32(0x882222ee), SPH_C32(0xa82a2ad6), + SPH_C32(0x769090dd), SPH_C32(0x16888895), SPH_C32(0x034646c9), + SPH_C32(0x95eeeebc), SPH_C32(0xd6b8b805), SPH_C32(0x5014146c), + SPH_C32(0x55dede2c), SPH_C32(0x635e5e81), SPH_C32(0x2c0b0b31), + SPH_C32(0x41dbdb37), SPH_C32(0xade0e096), SPH_C32(0xc832329e), + SPH_C32(0xe83a3aa6), SPH_C32(0x280a0a36), SPH_C32(0x3f4949e4), + SPH_C32(0x18060612), SPH_C32(0x902424fc), SPH_C32(0x6b5c5c8f), + SPH_C32(0x25c2c278), SPH_C32(0x61d3d30f), SPH_C32(0x86acac69), + 
SPH_C32(0x93626235), SPH_C32(0x729191da), SPH_C32(0x629595c6), + SPH_C32(0xbde4e48a), SPH_C32(0xff797974), SPH_C32(0xb1e7e783), + SPH_C32(0x0dc8c84e), SPH_C32(0xdc373785), SPH_C32(0xaf6d6d18), + SPH_C32(0x028d8d8e), SPH_C32(0x79d5d51d), SPH_C32(0x234e4ef1), + SPH_C32(0x92a9a972), SPH_C32(0xab6c6c1f), SPH_C32(0x435656b9), + SPH_C32(0xfdf4f4fa), SPH_C32(0x85eaeaa0), SPH_C32(0x8f656520), + SPH_C32(0xf37a7a7d), SPH_C32(0x8eaeae67), SPH_C32(0x20080838), + SPH_C32(0xdebaba0b), SPH_C32(0xfb787873), SPH_C32(0x942525fb), + SPH_C32(0xb82e2eca), SPH_C32(0x701c1c54), SPH_C32(0xaea6a65f), + SPH_C32(0xe6b4b421), SPH_C32(0x35c6c664), SPH_C32(0x8de8e8ae), + SPH_C32(0x59dddd25), SPH_C32(0xcb747457), SPH_C32(0x7c1f1f5d), + SPH_C32(0x374b4bea), SPH_C32(0xc2bdbd1e), SPH_C32(0x1a8b8b9c), + SPH_C32(0x1e8a8a9b), SPH_C32(0xdb70704b), SPH_C32(0xf83e3eba), + SPH_C32(0xe2b5b526), SPH_C32(0x83666629), SPH_C32(0x3b4848e3), + SPH_C32(0x0c030309), SPH_C32(0xf5f6f6f4), SPH_C32(0x380e0e2a), + SPH_C32(0x9f61613c), SPH_C32(0xd435358b), SPH_C32(0x475757be), + SPH_C32(0xd2b9b902), SPH_C32(0x2e8686bf), SPH_C32(0x29c1c171), + SPH_C32(0x741d1d53), SPH_C32(0x4e9e9ef7), SPH_C32(0xa9e1e191), + SPH_C32(0xcdf8f8de), SPH_C32(0x569898e5), SPH_C32(0x44111177), + SPH_C32(0xbf696904), SPH_C32(0x49d9d939), SPH_C32(0x0e8e8e87), + SPH_C32(0x669494c1), SPH_C32(0x5a9b9bec), SPH_C32(0x781e1e5a), + SPH_C32(0x2a8787b8), SPH_C32(0x89e9e9a9), SPH_C32(0x15cece5c), + SPH_C32(0x4f5555b0), SPH_C32(0xa02828d8), SPH_C32(0x51dfdf2b), + SPH_C32(0x068c8c89), SPH_C32(0xb2a1a14a), SPH_C32(0x12898992), + SPH_C32(0x340d0d23), SPH_C32(0xcabfbf10), SPH_C32(0xb5e6e684), + SPH_C32(0x134242d5), SPH_C32(0xbb686803), SPH_C32(0x1f4141dc), + SPH_C32(0x529999e2), SPH_C32(0xb42d2dc3), SPH_C32(0x3c0f0f2d), + SPH_C32(0xf6b0b03d), SPH_C32(0x4b5454b7), SPH_C32(0xdabbbb0c), + SPH_C32(0x58161662) +}; + +static const uint32_t mixtab2_cpu[] = { + SPH_C32(0x32976363), SPH_C32(0x6feb7c7c), SPH_C32(0x5ec77777), + SPH_C32(0x7af77b7b), SPH_C32(0xe8e5f2f2), SPH_C32(0x0ab76b6b), + SPH_C32(0x16a76f6f), SPH_C32(0x6d39c5c5), SPH_C32(0x90c03030), + SPH_C32(0x07040101), SPH_C32(0x2e876767), SPH_C32(0xd1ac2b2b), + SPH_C32(0xccd5fefe), SPH_C32(0x1371d7d7), SPH_C32(0x7c9aabab), + SPH_C32(0x59c37676), SPH_C32(0x4005caca), SPH_C32(0xa33e8282), + SPH_C32(0x4909c9c9), SPH_C32(0x68ef7d7d), SPH_C32(0xd0c5fafa), + SPH_C32(0x947f5959), SPH_C32(0xce074747), SPH_C32(0xe6edf0f0), + SPH_C32(0x6e82adad), SPH_C32(0x1a7dd4d4), SPH_C32(0x43bea2a2), + SPH_C32(0x608aafaf), SPH_C32(0xf9469c9c), SPH_C32(0x51a6a4a4), + SPH_C32(0x45d37272), SPH_C32(0x762dc0c0), SPH_C32(0x28eab7b7), + SPH_C32(0xc5d9fdfd), SPH_C32(0xd47a9393), SPH_C32(0xf2982626), + SPH_C32(0x82d83636), SPH_C32(0xbdfc3f3f), SPH_C32(0xf3f1f7f7), + SPH_C32(0x521dcccc), SPH_C32(0x8cd03434), SPH_C32(0x56a2a5a5), + SPH_C32(0x8db9e5e5), SPH_C32(0xe1e9f1f1), SPH_C32(0x4cdf7171), + SPH_C32(0x3e4dd8d8), SPH_C32(0x97c43131), SPH_C32(0x6b541515), + SPH_C32(0x1c100404), SPH_C32(0x6331c7c7), SPH_C32(0xe98c2323), + SPH_C32(0x7f21c3c3), SPH_C32(0x48601818), SPH_C32(0xcf6e9696), + SPH_C32(0x1b140505), SPH_C32(0xeb5e9a9a), SPH_C32(0x151c0707), + SPH_C32(0x7e481212), SPH_C32(0xad368080), SPH_C32(0x98a5e2e2), + SPH_C32(0xa781ebeb), SPH_C32(0xf59c2727), SPH_C32(0x33feb2b2), + SPH_C32(0x50cf7575), SPH_C32(0x3f240909), SPH_C32(0xa43a8383), + SPH_C32(0xc4b02c2c), SPH_C32(0x46681a1a), SPH_C32(0x416c1b1b), + SPH_C32(0x11a36e6e), SPH_C32(0x9d735a5a), SPH_C32(0x4db6a0a0), + SPH_C32(0xa5535252), SPH_C32(0xa1ec3b3b), SPH_C32(0x1475d6d6), + SPH_C32(0x34fab3b3), 
SPH_C32(0xdfa42929), SPH_C32(0x9fa1e3e3), + SPH_C32(0xcdbc2f2f), SPH_C32(0xb1268484), SPH_C32(0xa2575353), + SPH_C32(0x0169d1d1), SPH_C32(0x00000000), SPH_C32(0xb599eded), + SPH_C32(0xe0802020), SPH_C32(0xc2ddfcfc), SPH_C32(0x3af2b1b1), + SPH_C32(0x9a775b5b), SPH_C32(0x0db36a6a), SPH_C32(0x4701cbcb), + SPH_C32(0x17cebebe), SPH_C32(0xafe43939), SPH_C32(0xed334a4a), + SPH_C32(0xff2b4c4c), SPH_C32(0x937b5858), SPH_C32(0x5b11cfcf), + SPH_C32(0x066dd0d0), SPH_C32(0xbb91efef), SPH_C32(0x7b9eaaaa), + SPH_C32(0xd7c1fbfb), SPH_C32(0xd2174343), SPH_C32(0xf82f4d4d), + SPH_C32(0x99cc3333), SPH_C32(0xb6228585), SPH_C32(0xc00f4545), + SPH_C32(0xd9c9f9f9), SPH_C32(0x0e080202), SPH_C32(0x66e77f7f), + SPH_C32(0xab5b5050), SPH_C32(0xb4f03c3c), SPH_C32(0xf04a9f9f), + SPH_C32(0x7596a8a8), SPH_C32(0xac5f5151), SPH_C32(0x44baa3a3), + SPH_C32(0xdb1b4040), SPH_C32(0x800a8f8f), SPH_C32(0xd37e9292), + SPH_C32(0xfe429d9d), SPH_C32(0xa8e03838), SPH_C32(0xfdf9f5f5), + SPH_C32(0x19c6bcbc), SPH_C32(0x2feeb6b6), SPH_C32(0x3045dada), + SPH_C32(0xe7842121), SPH_C32(0x70401010), SPH_C32(0xcbd1ffff), + SPH_C32(0xefe1f3f3), SPH_C32(0x0865d2d2), SPH_C32(0x5519cdcd), + SPH_C32(0x24300c0c), SPH_C32(0x794c1313), SPH_C32(0xb29decec), + SPH_C32(0x86675f5f), SPH_C32(0xc86a9797), SPH_C32(0xc70b4444), + SPH_C32(0x655c1717), SPH_C32(0x6a3dc4c4), SPH_C32(0x58aaa7a7), + SPH_C32(0x61e37e7e), SPH_C32(0xb3f43d3d), SPH_C32(0x278b6464), + SPH_C32(0x886f5d5d), SPH_C32(0x4f641919), SPH_C32(0x42d77373), + SPH_C32(0x3b9b6060), SPH_C32(0xaa328181), SPH_C32(0xf6274f4f), + SPH_C32(0x225ddcdc), SPH_C32(0xee882222), SPH_C32(0xd6a82a2a), + SPH_C32(0xdd769090), SPH_C32(0x95168888), SPH_C32(0xc9034646), + SPH_C32(0xbc95eeee), SPH_C32(0x05d6b8b8), SPH_C32(0x6c501414), + SPH_C32(0x2c55dede), SPH_C32(0x81635e5e), SPH_C32(0x312c0b0b), + SPH_C32(0x3741dbdb), SPH_C32(0x96ade0e0), SPH_C32(0x9ec83232), + SPH_C32(0xa6e83a3a), SPH_C32(0x36280a0a), SPH_C32(0xe43f4949), + SPH_C32(0x12180606), SPH_C32(0xfc902424), SPH_C32(0x8f6b5c5c), + SPH_C32(0x7825c2c2), SPH_C32(0x0f61d3d3), SPH_C32(0x6986acac), + SPH_C32(0x35936262), SPH_C32(0xda729191), SPH_C32(0xc6629595), + SPH_C32(0x8abde4e4), SPH_C32(0x74ff7979), SPH_C32(0x83b1e7e7), + SPH_C32(0x4e0dc8c8), SPH_C32(0x85dc3737), SPH_C32(0x18af6d6d), + SPH_C32(0x8e028d8d), SPH_C32(0x1d79d5d5), SPH_C32(0xf1234e4e), + SPH_C32(0x7292a9a9), SPH_C32(0x1fab6c6c), SPH_C32(0xb9435656), + SPH_C32(0xfafdf4f4), SPH_C32(0xa085eaea), SPH_C32(0x208f6565), + SPH_C32(0x7df37a7a), SPH_C32(0x678eaeae), SPH_C32(0x38200808), + SPH_C32(0x0bdebaba), SPH_C32(0x73fb7878), SPH_C32(0xfb942525), + SPH_C32(0xcab82e2e), SPH_C32(0x54701c1c), SPH_C32(0x5faea6a6), + SPH_C32(0x21e6b4b4), SPH_C32(0x6435c6c6), SPH_C32(0xae8de8e8), + SPH_C32(0x2559dddd), SPH_C32(0x57cb7474), SPH_C32(0x5d7c1f1f), + SPH_C32(0xea374b4b), SPH_C32(0x1ec2bdbd), SPH_C32(0x9c1a8b8b), + SPH_C32(0x9b1e8a8a), SPH_C32(0x4bdb7070), SPH_C32(0xbaf83e3e), + SPH_C32(0x26e2b5b5), SPH_C32(0x29836666), SPH_C32(0xe33b4848), + SPH_C32(0x090c0303), SPH_C32(0xf4f5f6f6), SPH_C32(0x2a380e0e), + SPH_C32(0x3c9f6161), SPH_C32(0x8bd43535), SPH_C32(0xbe475757), + SPH_C32(0x02d2b9b9), SPH_C32(0xbf2e8686), SPH_C32(0x7129c1c1), + SPH_C32(0x53741d1d), SPH_C32(0xf74e9e9e), SPH_C32(0x91a9e1e1), + SPH_C32(0xdecdf8f8), SPH_C32(0xe5569898), SPH_C32(0x77441111), + SPH_C32(0x04bf6969), SPH_C32(0x3949d9d9), SPH_C32(0x870e8e8e), + SPH_C32(0xc1669494), SPH_C32(0xec5a9b9b), SPH_C32(0x5a781e1e), + SPH_C32(0xb82a8787), SPH_C32(0xa989e9e9), SPH_C32(0x5c15cece), + SPH_C32(0xb04f5555), SPH_C32(0xd8a02828), SPH_C32(0x2b51dfdf), + 
SPH_C32(0x89068c8c), SPH_C32(0x4ab2a1a1), SPH_C32(0x92128989), + SPH_C32(0x23340d0d), SPH_C32(0x10cabfbf), SPH_C32(0x84b5e6e6), + SPH_C32(0xd5134242), SPH_C32(0x03bb6868), SPH_C32(0xdc1f4141), + SPH_C32(0xe2529999), SPH_C32(0xc3b42d2d), SPH_C32(0x2d3c0f0f), + SPH_C32(0x3df6b0b0), SPH_C32(0xb74b5454), SPH_C32(0x0cdabbbb), + SPH_C32(0x62581616) +}; + +static const uint32_t mixtab3_cpu[] = { + SPH_C32(0x63329763), SPH_C32(0x7c6feb7c), SPH_C32(0x775ec777), + SPH_C32(0x7b7af77b), SPH_C32(0xf2e8e5f2), SPH_C32(0x6b0ab76b), + SPH_C32(0x6f16a76f), SPH_C32(0xc56d39c5), SPH_C32(0x3090c030), + SPH_C32(0x01070401), SPH_C32(0x672e8767), SPH_C32(0x2bd1ac2b), + SPH_C32(0xfeccd5fe), SPH_C32(0xd71371d7), SPH_C32(0xab7c9aab), + SPH_C32(0x7659c376), SPH_C32(0xca4005ca), SPH_C32(0x82a33e82), + SPH_C32(0xc94909c9), SPH_C32(0x7d68ef7d), SPH_C32(0xfad0c5fa), + SPH_C32(0x59947f59), SPH_C32(0x47ce0747), SPH_C32(0xf0e6edf0), + SPH_C32(0xad6e82ad), SPH_C32(0xd41a7dd4), SPH_C32(0xa243bea2), + SPH_C32(0xaf608aaf), SPH_C32(0x9cf9469c), SPH_C32(0xa451a6a4), + SPH_C32(0x7245d372), SPH_C32(0xc0762dc0), SPH_C32(0xb728eab7), + SPH_C32(0xfdc5d9fd), SPH_C32(0x93d47a93), SPH_C32(0x26f29826), + SPH_C32(0x3682d836), SPH_C32(0x3fbdfc3f), SPH_C32(0xf7f3f1f7), + SPH_C32(0xcc521dcc), SPH_C32(0x348cd034), SPH_C32(0xa556a2a5), + SPH_C32(0xe58db9e5), SPH_C32(0xf1e1e9f1), SPH_C32(0x714cdf71), + SPH_C32(0xd83e4dd8), SPH_C32(0x3197c431), SPH_C32(0x156b5415), + SPH_C32(0x041c1004), SPH_C32(0xc76331c7), SPH_C32(0x23e98c23), + SPH_C32(0xc37f21c3), SPH_C32(0x18486018), SPH_C32(0x96cf6e96), + SPH_C32(0x051b1405), SPH_C32(0x9aeb5e9a), SPH_C32(0x07151c07), + SPH_C32(0x127e4812), SPH_C32(0x80ad3680), SPH_C32(0xe298a5e2), + SPH_C32(0xeba781eb), SPH_C32(0x27f59c27), SPH_C32(0xb233feb2), + SPH_C32(0x7550cf75), SPH_C32(0x093f2409), SPH_C32(0x83a43a83), + SPH_C32(0x2cc4b02c), SPH_C32(0x1a46681a), SPH_C32(0x1b416c1b), + SPH_C32(0x6e11a36e), SPH_C32(0x5a9d735a), SPH_C32(0xa04db6a0), + SPH_C32(0x52a55352), SPH_C32(0x3ba1ec3b), SPH_C32(0xd61475d6), + SPH_C32(0xb334fab3), SPH_C32(0x29dfa429), SPH_C32(0xe39fa1e3), + SPH_C32(0x2fcdbc2f), SPH_C32(0x84b12684), SPH_C32(0x53a25753), + SPH_C32(0xd10169d1), SPH_C32(0x00000000), SPH_C32(0xedb599ed), + SPH_C32(0x20e08020), SPH_C32(0xfcc2ddfc), SPH_C32(0xb13af2b1), + SPH_C32(0x5b9a775b), SPH_C32(0x6a0db36a), SPH_C32(0xcb4701cb), + SPH_C32(0xbe17cebe), SPH_C32(0x39afe439), SPH_C32(0x4aed334a), + SPH_C32(0x4cff2b4c), SPH_C32(0x58937b58), SPH_C32(0xcf5b11cf), + SPH_C32(0xd0066dd0), SPH_C32(0xefbb91ef), SPH_C32(0xaa7b9eaa), + SPH_C32(0xfbd7c1fb), SPH_C32(0x43d21743), SPH_C32(0x4df82f4d), + SPH_C32(0x3399cc33), SPH_C32(0x85b62285), SPH_C32(0x45c00f45), + SPH_C32(0xf9d9c9f9), SPH_C32(0x020e0802), SPH_C32(0x7f66e77f), + SPH_C32(0x50ab5b50), SPH_C32(0x3cb4f03c), SPH_C32(0x9ff04a9f), + SPH_C32(0xa87596a8), SPH_C32(0x51ac5f51), SPH_C32(0xa344baa3), + SPH_C32(0x40db1b40), SPH_C32(0x8f800a8f), SPH_C32(0x92d37e92), + SPH_C32(0x9dfe429d), SPH_C32(0x38a8e038), SPH_C32(0xf5fdf9f5), + SPH_C32(0xbc19c6bc), SPH_C32(0xb62feeb6), SPH_C32(0xda3045da), + SPH_C32(0x21e78421), SPH_C32(0x10704010), SPH_C32(0xffcbd1ff), + SPH_C32(0xf3efe1f3), SPH_C32(0xd20865d2), SPH_C32(0xcd5519cd), + SPH_C32(0x0c24300c), SPH_C32(0x13794c13), SPH_C32(0xecb29dec), + SPH_C32(0x5f86675f), SPH_C32(0x97c86a97), SPH_C32(0x44c70b44), + SPH_C32(0x17655c17), SPH_C32(0xc46a3dc4), SPH_C32(0xa758aaa7), + SPH_C32(0x7e61e37e), SPH_C32(0x3db3f43d), SPH_C32(0x64278b64), + SPH_C32(0x5d886f5d), SPH_C32(0x194f6419), SPH_C32(0x7342d773), + SPH_C32(0x603b9b60), 
SPH_C32(0x81aa3281), SPH_C32(0x4ff6274f), + SPH_C32(0xdc225ddc), SPH_C32(0x22ee8822), SPH_C32(0x2ad6a82a), + SPH_C32(0x90dd7690), SPH_C32(0x88951688), SPH_C32(0x46c90346), + SPH_C32(0xeebc95ee), SPH_C32(0xb805d6b8), SPH_C32(0x146c5014), + SPH_C32(0xde2c55de), SPH_C32(0x5e81635e), SPH_C32(0x0b312c0b), + SPH_C32(0xdb3741db), SPH_C32(0xe096ade0), SPH_C32(0x329ec832), + SPH_C32(0x3aa6e83a), SPH_C32(0x0a36280a), SPH_C32(0x49e43f49), + SPH_C32(0x06121806), SPH_C32(0x24fc9024), SPH_C32(0x5c8f6b5c), + SPH_C32(0xc27825c2), SPH_C32(0xd30f61d3), SPH_C32(0xac6986ac), + SPH_C32(0x62359362), SPH_C32(0x91da7291), SPH_C32(0x95c66295), + SPH_C32(0xe48abde4), SPH_C32(0x7974ff79), SPH_C32(0xe783b1e7), + SPH_C32(0xc84e0dc8), SPH_C32(0x3785dc37), SPH_C32(0x6d18af6d), + SPH_C32(0x8d8e028d), SPH_C32(0xd51d79d5), SPH_C32(0x4ef1234e), + SPH_C32(0xa97292a9), SPH_C32(0x6c1fab6c), SPH_C32(0x56b94356), + SPH_C32(0xf4fafdf4), SPH_C32(0xeaa085ea), SPH_C32(0x65208f65), + SPH_C32(0x7a7df37a), SPH_C32(0xae678eae), SPH_C32(0x08382008), + SPH_C32(0xba0bdeba), SPH_C32(0x7873fb78), SPH_C32(0x25fb9425), + SPH_C32(0x2ecab82e), SPH_C32(0x1c54701c), SPH_C32(0xa65faea6), + SPH_C32(0xb421e6b4), SPH_C32(0xc66435c6), SPH_C32(0xe8ae8de8), + SPH_C32(0xdd2559dd), SPH_C32(0x7457cb74), SPH_C32(0x1f5d7c1f), + SPH_C32(0x4bea374b), SPH_C32(0xbd1ec2bd), SPH_C32(0x8b9c1a8b), + SPH_C32(0x8a9b1e8a), SPH_C32(0x704bdb70), SPH_C32(0x3ebaf83e), + SPH_C32(0xb526e2b5), SPH_C32(0x66298366), SPH_C32(0x48e33b48), + SPH_C32(0x03090c03), SPH_C32(0xf6f4f5f6), SPH_C32(0x0e2a380e), + SPH_C32(0x613c9f61), SPH_C32(0x358bd435), SPH_C32(0x57be4757), + SPH_C32(0xb902d2b9), SPH_C32(0x86bf2e86), SPH_C32(0xc17129c1), + SPH_C32(0x1d53741d), SPH_C32(0x9ef74e9e), SPH_C32(0xe191a9e1), + SPH_C32(0xf8decdf8), SPH_C32(0x98e55698), SPH_C32(0x11774411), + SPH_C32(0x6904bf69), SPH_C32(0xd93949d9), SPH_C32(0x8e870e8e), + SPH_C32(0x94c16694), SPH_C32(0x9bec5a9b), SPH_C32(0x1e5a781e), + SPH_C32(0x87b82a87), SPH_C32(0xe9a989e9), SPH_C32(0xce5c15ce), + SPH_C32(0x55b04f55), SPH_C32(0x28d8a028), SPH_C32(0xdf2b51df), + SPH_C32(0x8c89068c), SPH_C32(0xa14ab2a1), SPH_C32(0x89921289), + SPH_C32(0x0d23340d), SPH_C32(0xbf10cabf), SPH_C32(0xe684b5e6), + SPH_C32(0x42d51342), SPH_C32(0x6803bb68), SPH_C32(0x41dc1f41), + SPH_C32(0x99e25299), SPH_C32(0x2dc3b42d), SPH_C32(0x0f2d3c0f), + SPH_C32(0xb03df6b0), SPH_C32(0x54b74b54), SPH_C32(0xbb0cdabb), + SPH_C32(0x16625816) +}; + +#define TIX2(q, x00, x01, x08, x10, x24) { \ + x10 ^= x00; \ + x00 = (q); \ + x08 ^= x00; \ + x01 ^= x24; \ + } + +#define TIX3(q, x00, x01, x04, x08, x16, x27, x30) { \ + x16 ^= x00; \ + x00 = (q); \ + x08 ^= x00; \ + x01 ^= x27; \ + x04 ^= x30; \ + } + +#define TIX4(q, x00, x01, x04, x07, x08, x22, x24, x27, x30) { \ + x22 ^= x00; \ + x00 = (q); \ + x08 ^= x00; \ + x01 ^= x24; \ + x04 ^= x27; \ + x07 ^= x30; \ + } + +#define CMIX30(x00, x01, x02, x04, x05, x06, x15, x16, x17) { \ + x00 ^= x04; \ + x01 ^= x05; \ + x02 ^= x06; \ + x15 ^= x04; \ + x16 ^= x05; \ + x17 ^= x06; \ + } + +#define CMIX36(x00, x01, x02, x04, x05, x06, x18, x19, x20) { \ + x00 ^= x04; \ + x01 ^= x05; \ + x02 ^= x06; \ + x18 ^= x04; \ + x19 ^= x05; \ + x20 ^= x06; \ + } + +#define SMIX(x0, x1, x2, x3) { \ + uint32_t c0 = 0; \ + uint32_t c1 = 0; \ + uint32_t c2 = 0; \ + uint32_t c3 = 0; \ + uint32_t r0 = 0; \ + uint32_t r1 = 0; \ + uint32_t r2 = 0; \ + uint32_t r3 = 0; \ + uint32_t tmp; \ + tmp = mixtab0(x0 >> 24); \ + c0 ^= tmp; \ + tmp = mixtab1((x0 >> 16) & 0xFF); \ + c0 ^= tmp; \ + r1 ^= tmp; \ + tmp = mixtab2((x0 >> 8) & 0xFF); \ + c0 ^= tmp; \ + 
r2 ^= tmp; \
+		tmp = mixtab3(x0 & 0xFF); \
+		c0 ^= tmp; \
+		r3 ^= tmp; \
+		tmp = mixtab0(x1 >> 24); \
+		c1 ^= tmp; \
+		r0 ^= tmp; \
+		tmp = mixtab1((x1 >> 16) & 0xFF); \
+		c1 ^= tmp; \
+		tmp = mixtab2((x1 >> 8) & 0xFF); \
+		c1 ^= tmp; \
+		r2 ^= tmp; \
+		tmp = mixtab3(x1 & 0xFF); \
+		c1 ^= tmp; \
+		r3 ^= tmp; \
+		tmp = mixtab0(x2 >> 24); \
+		c2 ^= tmp; \
+		r0 ^= tmp; \
+		tmp = mixtab1((x2 >> 16) & 0xFF); \
+		c2 ^= tmp; \
+		r1 ^= tmp; \
+		tmp = mixtab2((x2 >> 8) & 0xFF); \
+		c2 ^= tmp; \
+		tmp = mixtab3(x2 & 0xFF); \
+		c2 ^= tmp; \
+		r3 ^= tmp; \
+		tmp = mixtab0(x3 >> 24); \
+		c3 ^= tmp; \
+		r0 ^= tmp; \
+		tmp = mixtab1((x3 >> 16) & 0xFF); \
+		c3 ^= tmp; \
+		r1 ^= tmp; \
+		tmp = mixtab2((x3 >> 8) & 0xFF); \
+		c3 ^= tmp; \
+		r2 ^= tmp; \
+		tmp = mixtab3(x3 & 0xFF); \
+		c3 ^= tmp; \
+		x0 = ((c0 ^ r0) & SPH_C32(0xFF000000)) \
+			| ((c1 ^ r1) & SPH_C32(0x00FF0000)) \
+			| ((c2 ^ r2) & SPH_C32(0x0000FF00)) \
+			| ((c3 ^ r3) & SPH_C32(0x000000FF)); \
+		x1 = ((c1 ^ (r0 << 8)) & SPH_C32(0xFF000000)) \
+			| ((c2 ^ (r1 << 8)) & SPH_C32(0x00FF0000)) \
+			| ((c3 ^ (r2 << 8)) & SPH_C32(0x0000FF00)) \
+			| ((c0 ^ (r3 >> 24)) & SPH_C32(0x000000FF)); \
+		x2 = ((c2 ^ (r0 << 16)) & SPH_C32(0xFF000000)) \
+			| ((c3 ^ (r1 << 16)) & SPH_C32(0x00FF0000)) \
+			| ((c0 ^ (r2 >> 16)) & SPH_C32(0x0000FF00)) \
+			| ((c1 ^ (r3 >> 16)) & SPH_C32(0x000000FF)); \
+		x3 = ((c3 ^ (r0 << 24)) & SPH_C32(0xFF000000)) \
+			| ((c0 ^ (r1 >> 8)) & SPH_C32(0x00FF0000)) \
+			| ((c1 ^ (r2 >> 8)) & SPH_C32(0x0000FF00)) \
+			| ((c2 ^ (r3 >> 8)) & SPH_C32(0x000000FF)); \
+		/* */ \
+	}
+
+#define S00 (sc[ 0])
+#define S01 (sc[ 1])
+#define S02 (sc[ 2])
+#define S03 (sc[ 3])
+#define S04 (sc[ 4])
+#define S05 (sc[ 5])
+#define S06 (sc[ 6])
+#define S07 (sc[ 7])
+#define S08 (sc[ 8])
+#define S09 (sc[ 9])
+#define S10 (sc[10])
+#define S11 (sc[11])
+#define S12 (sc[12])
+#define S13 (sc[13])
+#define S14 (sc[14])
+#define S15 (sc[15])
+#define S16 (sc[16])
+#define S17 (sc[17])
+#define S18 (sc[18])
+#define S19 (sc[19])
+#define S20 (sc[20])
+#define S21 (sc[21])
+#define S22 (sc[22])
+#define S23 (sc[23])
+#define S24 (sc[24])
+#define S25 (sc[25])
+#define S26 (sc[26])
+#define S27 (sc[27])
+#define S28 (sc[28])
+#define S29 (sc[29])
+#define S30 (sc[30])
+#define S31 (sc[31])
+#define S32 (sc[32])
+#define S33 (sc[33])
+#define S34 (sc[34])
+#define S35 (sc[35])
+
+#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
+/* GPU FUNCTIONS */
+
+__global__ void fugue256_gpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce)
+{
+	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		/* Take the state and process the final bytes (the nonce) */
+		uint32_t sc[30];
+
+		#pragma unroll 30
+		for(int i=0;i<30;i++)
+			sc[i] = GPUstate[i];
+
+		uint32_t nounce = startNounce + thread; // still to be determined
+		uint32_t q;
+
+
+		// At byte 80 the sub-rounds 4-0-1 are run (fixed here)
+
+		// Sub-round 4
+		q = SWAB32(nounce);
+		TIX2(q, S06, S07, S14, S16, S00);
+		CMIX30(S03, S04, S05, S07, S08, S09, S18, S19, S20);
+		SMIX(S03, S04, S05, S06);
+		CMIX30(S00, S01, S02, S04, S05, S06, S15, S16, S17);
+		SMIX(S00, S01, S02, S03);
+
+		// Sub-round 0
+		q = 0;
+		TIX2(q, S00, S01, S08, S10, S24);
+		CMIX30(S27, S28, S29, S01, S02, S03, S12, S13, S14);
+		SMIX(S27, S28, S29, S00);
+		CMIX30(S24, S25, S26, S28, S29, S00, S09, S10, S11);
+		SMIX(S24, S25, S26, S27);
+
+		// Sub-round 1
+		q = 0x280; // hopefully the right byte order...
+		TIX2(q, S24, S25, S02, S04, S18);
+		CMIX30(S21, S22, S23, S25, S26, S27, S06, S07, S08);
+		SMIX(S21, S22, S23, S24);
+		CMIX30(S18, S19, S20, S22, S23, S24, S03, S04, S05);
+		SMIX(S18, S19, S20, S21);
+
+		// End of the round
+		// rms = 12, i.e. 30 - 12 = 18
+
+		#pragma unroll 10
+		for(int i=0;i<10;i++)
+		{
+			//ROR(3, 30);
+			uint32_t tmp[3];
+			#pragma unroll 3
+			for(int k=0;k<3;k++)
+				tmp[k] = sc[27+k];
+			#pragma unroll 27
+			for(int k=26;k>=0;k--)
+				sc[k+3] = sc[k];
+			#pragma unroll 3
+			for(int k=0;k<3;k++)
+				sc[k] = tmp[k];
+
+
+			CMIX30(sc[18], sc[19], sc[20], sc[22], sc[23], sc[24], sc[3], sc[4], sc[5]);
+			SMIX(sc[18], sc[19], sc[20], sc[21]);
+		}
+
+		#pragma unroll 13
+		for(int i=0;i<13;i++)
+		{
+			sc[22] ^= sc[18];
+			sc[3] ^= sc[18];
+
+			// ROR(15, 30); BEGIN
+			uint32_t tmp1[15];
+			#pragma unroll 15
+			for(int k=0;k<15;k++)
+				tmp1[k] = sc[15+k];
+			#pragma unroll 15
+			for(int k=14;k>=0;k--)
+				sc[k+15] = sc[k];
+			#pragma unroll 15
+			for(int k=0;k<15;k++)
+				sc[k] = tmp1[k];
+			// ROR(15, 30); END
+
+			SMIX(sc[18], sc[19], sc[20], sc[21]);
+			sc[22] ^= sc[18];
+			sc[4] ^= sc[18];
+
+			// ROR(14, 30); BEGIN
+			uint32_t tmp2[14];
+			#pragma unroll 14
+			for(int k=0;k<14;k++)
+				tmp2[k] = sc[16+k];
+			#pragma unroll 16
+			for(int k=15;k>=0;k--)
+				sc[k+14] = sc[k];
+			#pragma unroll 14
+			for(int k=0;k<14;k++)
+				sc[k] = tmp2[k];
+			// ROR(14, 30); END
+
+			SMIX(sc[18], sc[19], sc[20], sc[21]);
+		}
+
+		sc[22] ^= sc[18];
+		sc[3] ^= sc[18];
+
+		/*
+		// SWAP32 and write out the data
+		#pragma unroll 4
+		for(int i=0;i<4;i++)
+			((uint32_t*)outputHash)[8*thread+i] = SWAB32(sc[19+i]);
+
+		#pragma unroll 4
+		for(int i=0;i<4;i++)
+			((uint32_t*)outputHash)[8*thread+i+4] = SWAB32(sc[3+i]);
+		*/
+		uint32_t hash[8];
+		#pragma unroll 4
+		for(int i=0;i<4;i++)
+			((uint32_t*)hash)[i] = SWAB32(sc[19+i]);
+
+		#pragma unroll 4
+		for(int i=0;i<4;i++)
+			((uint32_t*)hash)[i+4] = SWAB32(sc[3+i]);
+
+		int i;
+		bool rc = true;
+
+		for (i = 7; i >= 0; i--) {
+			if (hash[i] > pTarget[i]) {
+				rc = false;
+				break;
+			}
+			if (hash[i] < pTarget[i]) {
+				rc = true;
+				break;
+			}
+		}
+
+		if(rc == true)
+		{
+			if(resNounce[0] > SWAB32(nounce))
+				resNounce[0] = SWAB32(nounce);
+		}
+	}
+}
+
+#define texDef(texname, texmem, texsource, texsize) \
+	unsigned int *texmem; \
+	cudaMalloc(&texmem, texsize); \
+	cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \
+	texname.normalized = 0; \
+	texname.filterMode = cudaFilterModePoint; \
+	texname.addressMode[0] = cudaAddressModeClamp; \
+	{ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned int>(); \
+	  cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \
+
+
+void fugue256_cpu_init(int thr_id, int threads)
+{
+	cudaSetDevice(thr_id);
+
+	// Copy the hash tables into GPU memory
+	/*
+	cudaMemcpyToSymbol( mixtab0, mixtab0_cpu, sizeof(uint32_t) * 256 );
+	cudaMemcpyToSymbol( mixtab1, mixtab1_cpu, sizeof(uint32_t) * 256 );
+	cudaMemcpyToSymbol( mixtab2, mixtab2_cpu, sizeof(uint32_t) * 256 );
+	cudaMemcpyToSymbol( mixtab3, mixtab3_cpu, sizeof(uint32_t) * 256 );
+	*/
+	texDef(mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256);
+	texDef(mixTab1Tex, mixTab1m, mixtab1_cpu, sizeof(uint32_t)*256);
+	texDef(mixTab2Tex, mixTab2m, mixtab2_cpu, sizeof(uint32_t)*256);
+	texDef(mixTab3Tex, mixTab3m, mixtab3_cpu, sizeof(uint32_t)*256);
+	// Allocate memory for all results
+	cudaMalloc(&d_fugue256_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads);
+	cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));
+}
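For readability, this is roughly what one invocation of the texDef macro above expands to, shown for the first Fugue table. The uploaded buffer is declared as unsigned int*, so the channel descriptor is assumed to be the <unsigned int> specialization; the expansion itself is otherwise a literal substitution of the macro body.

	/* Approximate expansion of texDef(mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256):
	   upload the 256-entry lookup table to device memory and bind it to a 1D texture. */
	unsigned int *mixTab0m;
	cudaMalloc(&mixTab0m, sizeof(uint32_t)*256);
	cudaMemcpy(mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256, cudaMemcpyHostToDevice);
	mixTab0Tex.normalized = 0;                        // index with integers, not [0,1)
	mixTab0Tex.filterMode = cudaFilterModePoint;      // no interpolation between entries
	mixTab0Tex.addressMode[0] = cudaAddressModeClamp; // clamp out-of-range indices
	{
		cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned int>(); // assumed element type
		cudaBindTexture(NULL, &mixTab0Tex, mixTab0m, &channelDesc, sizeof(uint32_t)*256);
	}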
+
+__host__ void fugue256_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
+{
+	// Do the CPU-side preparations
+	sph_fugue256_context ctx_fugue_const;
+	sph_fugue256_init(&ctx_fugue_const);
+	sph_fugue256 (&ctx_fugue_const, data, 80); // save the state
+
+	cudaMemcpyToSymbol( GPUstate, ctx_fugue_const.S, sizeof(uint32_t) * 30 );
+
+	cudaMemcpyToSymbol( pTarget, pTargetIn, sizeof(uint32_t) * 8 );
+
+	cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
+}
+
+__host__ void fugue256_cpu_hash(int thr_id, int threads, int startNounce, void *outputHashes, uint32_t *nounce)
+{
+	const int threadsperblock = 512;
+
+	// compute how many thread blocks we need
+	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+	dim3 block(threadsperblock);
+
+	// size of the dynamic shared memory region (depends on the number of threads)
+	//size_t shared_size = W_ALIGNMENT*sizeof(uint32_t)*threadsperblock; // one extra uint32_t inserted against bank conflicts
+	size_t shared_size = 0;
+
+	fugue256_gpu_hash<<<grid, block, shared_size>>>(thr_id, threads, startNounce, d_fugue256_hashoutput[thr_id], d_resultNonce[thr_id]);
+
+	// Strategic sleep command to reduce the CPU load
+	MyStreamSynchronize(NULL, 0, thr_id);
+
+	//cudaMemcpy(outputHashes, d_fugue256_hashoutput[thr_id], 8 * sizeof(uint32_t), cudaMemcpyDeviceToHost);
+	cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
+}
+
+#endif
diff --git a/cuda_fugue256.h b/cuda_fugue256.h
new file mode 100644
index 0000000..bb864fb
--- /dev/null
+++ b/cuda_fugue256.h
@@ -0,0 +1,8 @@
+#ifndef _CUDA_FUGUE512_H
+#define _CUDA_FUGUE512_H
+
+void fugue256_cpu_hash(int thr_id, int threads, int startNounce, void *outputHashes, uint32_t *nounce);
+void fugue256_cpu_setBlock(int thr_id, void *data, void *pTargetIn);
+void fugue256_cpu_init(int thr_id, int threads);
+
+#endif
diff --git a/cuda_groestl512.cu b/cuda_groestl512.cu
new file mode 100644
index 0000000..1c1dce9
--- /dev/null
+++ b/cuda_groestl512.cu
@@ -0,0 +1,837 @@
+/* This function targets 84+32-byte input data (Heavycoin) */
+#include
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+
+#include
+#include
+
+#define USE_SHARED 0
+#define W_ALIGNMENT 65
+
+// The following definitions will later be replaced by a header
+typedef unsigned char uint8_t;
+typedef unsigned int uint32_t;
+typedef unsigned long long uint64_t;
+
+// global memory for the Hefty hashes of all threads
+extern uint32_t *d_heftyHashes[8];
+extern uint32_t *d_nonceVector[8];
+
+// global memory for our results
+uint32_t *d_hash4output[8];
+
+__constant__ uint32_t groestl_gpu_state[32];
+__constant__ uint32_t groestl_gpu_msg[32];
+
+#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
+
+#define PC32up(j, r) ((uint32_t)((j) + (r)))
+#define PC32dn(j, r) 0
+#define QC32up(j, r) 0xFFFFFFFF
+#define QC32dn(j, r) (((uint32_t)(r) << 24) ^ SPH_T32(~((uint32_t)(j) << 24)))
+
+#define B32_0(x) ((x) & 0xFF)
+#define B32_1(x) (((x) >> 8) & 0xFF)
+#define B32_2(x) (((x) >> 16) & 0xFF)
+#define B32_3(x) ((x) >> 24)
+
+#define SPH_C32(x) ((uint32_t)(x ## U))
+#define C32e(x) ((SPH_C32(x) >> 24) \
+	| ((SPH_C32(x) >> 8) & SPH_C32(0x0000FF00)) \
+	| ((SPH_C32(x) << 8) & SPH_C32(0x00FF0000)) \
+	| ((SPH_C32(x) << 24) & SPH_C32(0xFF000000)))
+
+#define T0up(x) tex1Dfetch(t0up, x)
+#define T0dn(x) tex1Dfetch(t0dn, x)
+#define T1up(x) tex1Dfetch(t1up, x)
+#define T1dn(x) tex1Dfetch(t1dn, x)
+#define T2up(x) tex1Dfetch(t2up, x)
+#define T2dn(x) tex1Dfetch(t2dn, x)
+#define T3up(x) tex1Dfetch(t3up, x)
+#define T3dn(x) tex1Dfetch(t3dn, x)
+
+texture t0up;
+texture t0dn; +texture t1up; +texture t1dn; +texture t2up; +texture t2dn; +texture t3up; +texture t3dn; + +static const uint32_t T0up_cpu[] = { + C32e(0xc632f4a5), C32e(0xf86f9784), C32e(0xee5eb099), C32e(0xf67a8c8d), + C32e(0xffe8170d), C32e(0xd60adcbd), C32e(0xde16c8b1), C32e(0x916dfc54), + C32e(0x6090f050), C32e(0x02070503), C32e(0xce2ee0a9), C32e(0x56d1877d), + C32e(0xe7cc2b19), C32e(0xb513a662), C32e(0x4d7c31e6), C32e(0xec59b59a), + C32e(0x8f40cf45), C32e(0x1fa3bc9d), C32e(0x8949c040), C32e(0xfa689287), + C32e(0xefd03f15), C32e(0xb29426eb), C32e(0x8ece40c9), C32e(0xfbe61d0b), + C32e(0x416e2fec), C32e(0xb31aa967), C32e(0x5f431cfd), C32e(0x456025ea), + C32e(0x23f9dabf), C32e(0x535102f7), C32e(0xe445a196), C32e(0x9b76ed5b), + C32e(0x75285dc2), C32e(0xe1c5241c), C32e(0x3dd4e9ae), C32e(0x4cf2be6a), + C32e(0x6c82ee5a), C32e(0x7ebdc341), C32e(0xf5f30602), C32e(0x8352d14f), + C32e(0x688ce45c), C32e(0x515607f4), C32e(0xd18d5c34), C32e(0xf9e11808), + C32e(0xe24cae93), C32e(0xab3e9573), C32e(0x6297f553), C32e(0x2a6b413f), + C32e(0x081c140c), C32e(0x9563f652), C32e(0x46e9af65), C32e(0x9d7fe25e), + C32e(0x30487828), C32e(0x37cff8a1), C32e(0x0a1b110f), C32e(0x2febc4b5), + C32e(0x0e151b09), C32e(0x247e5a36), C32e(0x1badb69b), C32e(0xdf98473d), + C32e(0xcda76a26), C32e(0x4ef5bb69), C32e(0x7f334ccd), C32e(0xea50ba9f), + C32e(0x123f2d1b), C32e(0x1da4b99e), C32e(0x58c49c74), C32e(0x3446722e), + C32e(0x3641772d), C32e(0xdc11cdb2), C32e(0xb49d29ee), C32e(0x5b4d16fb), + C32e(0xa4a501f6), C32e(0x76a1d74d), C32e(0xb714a361), C32e(0x7d3449ce), + C32e(0x52df8d7b), C32e(0xdd9f423e), C32e(0x5ecd9371), C32e(0x13b1a297), + C32e(0xa6a204f5), C32e(0xb901b868), C32e(0x00000000), C32e(0xc1b5742c), + C32e(0x40e0a060), C32e(0xe3c2211f), C32e(0x793a43c8), C32e(0xb69a2ced), + C32e(0xd40dd9be), C32e(0x8d47ca46), C32e(0x671770d9), C32e(0x72afdd4b), + C32e(0x94ed79de), C32e(0x98ff67d4), C32e(0xb09323e8), C32e(0x855bde4a), + C32e(0xbb06bd6b), C32e(0xc5bb7e2a), C32e(0x4f7b34e5), C32e(0xedd73a16), + C32e(0x86d254c5), C32e(0x9af862d7), C32e(0x6699ff55), C32e(0x11b6a794), + C32e(0x8ac04acf), C32e(0xe9d93010), C32e(0x040e0a06), C32e(0xfe669881), + C32e(0xa0ab0bf0), C32e(0x78b4cc44), C32e(0x25f0d5ba), C32e(0x4b753ee3), + C32e(0xa2ac0ef3), C32e(0x5d4419fe), C32e(0x80db5bc0), C32e(0x0580858a), + C32e(0x3fd3ecad), C32e(0x21fedfbc), C32e(0x70a8d848), C32e(0xf1fd0c04), + C32e(0x63197adf), C32e(0x772f58c1), C32e(0xaf309f75), C32e(0x42e7a563), + C32e(0x20705030), C32e(0xe5cb2e1a), C32e(0xfdef120e), C32e(0xbf08b76d), + C32e(0x8155d44c), C32e(0x18243c14), C32e(0x26795f35), C32e(0xc3b2712f), + C32e(0xbe8638e1), C32e(0x35c8fda2), C32e(0x88c74fcc), C32e(0x2e654b39), + C32e(0x936af957), C32e(0x55580df2), C32e(0xfc619d82), C32e(0x7ab3c947), + C32e(0xc827efac), C32e(0xba8832e7), C32e(0x324f7d2b), C32e(0xe642a495), + C32e(0xc03bfba0), C32e(0x19aab398), C32e(0x9ef668d1), C32e(0xa322817f), + C32e(0x44eeaa66), C32e(0x54d6827e), C32e(0x3bdde6ab), C32e(0x0b959e83), + C32e(0x8cc945ca), C32e(0xc7bc7b29), C32e(0x6b056ed3), C32e(0x286c443c), + C32e(0xa72c8b79), C32e(0xbc813de2), C32e(0x1631271d), C32e(0xad379a76), + C32e(0xdb964d3b), C32e(0x649efa56), C32e(0x74a6d24e), C32e(0x1436221e), + C32e(0x92e476db), C32e(0x0c121e0a), C32e(0x48fcb46c), C32e(0xb88f37e4), + C32e(0x9f78e75d), C32e(0xbd0fb26e), C32e(0x43692aef), C32e(0xc435f1a6), + C32e(0x39dae3a8), C32e(0x31c6f7a4), C32e(0xd38a5937), C32e(0xf274868b), + C32e(0xd5835632), C32e(0x8b4ec543), C32e(0x6e85eb59), C32e(0xda18c2b7), + C32e(0x018e8f8c), C32e(0xb11dac64), C32e(0x9cf16dd2), C32e(0x49723be0), + 
C32e(0xd81fc7b4), C32e(0xacb915fa), C32e(0xf3fa0907), C32e(0xcfa06f25), + C32e(0xca20eaaf), C32e(0xf47d898e), C32e(0x476720e9), C32e(0x10382818), + C32e(0x6f0b64d5), C32e(0xf0738388), C32e(0x4afbb16f), C32e(0x5cca9672), + C32e(0x38546c24), C32e(0x575f08f1), C32e(0x732152c7), C32e(0x9764f351), + C32e(0xcbae6523), C32e(0xa125847c), C32e(0xe857bf9c), C32e(0x3e5d6321), + C32e(0x96ea7cdd), C32e(0x611e7fdc), C32e(0x0d9c9186), C32e(0x0f9b9485), + C32e(0xe04bab90), C32e(0x7cbac642), C32e(0x712657c4), C32e(0xcc29e5aa), + C32e(0x90e373d8), C32e(0x06090f05), C32e(0xf7f40301), C32e(0x1c2a3612), + C32e(0xc23cfea3), C32e(0x6a8be15f), C32e(0xaebe10f9), C32e(0x69026bd0), + C32e(0x17bfa891), C32e(0x9971e858), C32e(0x3a536927), C32e(0x27f7d0b9), + C32e(0xd9914838), C32e(0xebde3513), C32e(0x2be5ceb3), C32e(0x22775533), + C32e(0xd204d6bb), C32e(0xa9399070), C32e(0x07878089), C32e(0x33c1f2a7), + C32e(0x2decc1b6), C32e(0x3c5a6622), C32e(0x15b8ad92), C32e(0xc9a96020), + C32e(0x875cdb49), C32e(0xaab01aff), C32e(0x50d88878), C32e(0xa52b8e7a), + C32e(0x03898a8f), C32e(0x594a13f8), C32e(0x09929b80), C32e(0x1a233917), + C32e(0x651075da), C32e(0xd7845331), C32e(0x84d551c6), C32e(0xd003d3b8), + C32e(0x82dc5ec3), C32e(0x29e2cbb0), C32e(0x5ac39977), C32e(0x1e2d3311), + C32e(0x7b3d46cb), C32e(0xa8b71ffc), C32e(0x6d0c61d6), C32e(0x2c624e3a) +}; + +static const uint32_t T0dn_cpu[] = { + C32e(0xf497a5c6), C32e(0x97eb84f8), C32e(0xb0c799ee), C32e(0x8cf78df6), + C32e(0x17e50dff), C32e(0xdcb7bdd6), C32e(0xc8a7b1de), C32e(0xfc395491), + C32e(0xf0c05060), C32e(0x05040302), C32e(0xe087a9ce), C32e(0x87ac7d56), + C32e(0x2bd519e7), C32e(0xa67162b5), C32e(0x319ae64d), C32e(0xb5c39aec), + C32e(0xcf05458f), C32e(0xbc3e9d1f), C32e(0xc0094089), C32e(0x92ef87fa), + C32e(0x3fc515ef), C32e(0x267febb2), C32e(0x4007c98e), C32e(0x1ded0bfb), + C32e(0x2f82ec41), C32e(0xa97d67b3), C32e(0x1cbefd5f), C32e(0x258aea45), + C32e(0xda46bf23), C32e(0x02a6f753), C32e(0xa1d396e4), C32e(0xed2d5b9b), + C32e(0x5deac275), C32e(0x24d91ce1), C32e(0xe97aae3d), C32e(0xbe986a4c), + C32e(0xeed85a6c), C32e(0xc3fc417e), C32e(0x06f102f5), C32e(0xd11d4f83), + C32e(0xe4d05c68), C32e(0x07a2f451), C32e(0x5cb934d1), C32e(0x18e908f9), + C32e(0xaedf93e2), C32e(0x954d73ab), C32e(0xf5c45362), C32e(0x41543f2a), + C32e(0x14100c08), C32e(0xf6315295), C32e(0xaf8c6546), C32e(0xe2215e9d), + C32e(0x78602830), C32e(0xf86ea137), C32e(0x11140f0a), C32e(0xc45eb52f), + C32e(0x1b1c090e), C32e(0x5a483624), C32e(0xb6369b1b), C32e(0x47a53ddf), + C32e(0x6a8126cd), C32e(0xbb9c694e), C32e(0x4cfecd7f), C32e(0xbacf9fea), + C32e(0x2d241b12), C32e(0xb93a9e1d), C32e(0x9cb07458), C32e(0x72682e34), + C32e(0x776c2d36), C32e(0xcda3b2dc), C32e(0x2973eeb4), C32e(0x16b6fb5b), + C32e(0x0153f6a4), C32e(0xd7ec4d76), C32e(0xa37561b7), C32e(0x49face7d), + C32e(0x8da47b52), C32e(0x42a13edd), C32e(0x93bc715e), C32e(0xa2269713), + C32e(0x0457f5a6), C32e(0xb86968b9), C32e(0x00000000), C32e(0x74992cc1), + C32e(0xa0806040), C32e(0x21dd1fe3), C32e(0x43f2c879), C32e(0x2c77edb6), + C32e(0xd9b3bed4), C32e(0xca01468d), C32e(0x70ced967), C32e(0xdde44b72), + C32e(0x7933de94), C32e(0x672bd498), C32e(0x237be8b0), C32e(0xde114a85), + C32e(0xbd6d6bbb), C32e(0x7e912ac5), C32e(0x349ee54f), C32e(0x3ac116ed), + C32e(0x5417c586), C32e(0x622fd79a), C32e(0xffcc5566), C32e(0xa7229411), + C32e(0x4a0fcf8a), C32e(0x30c910e9), C32e(0x0a080604), C32e(0x98e781fe), + C32e(0x0b5bf0a0), C32e(0xccf04478), C32e(0xd54aba25), C32e(0x3e96e34b), + C32e(0x0e5ff3a2), C32e(0x19bafe5d), C32e(0x5b1bc080), C32e(0x850a8a05), + C32e(0xec7ead3f), 
C32e(0xdf42bc21), C32e(0xd8e04870), C32e(0x0cf904f1), + C32e(0x7ac6df63), C32e(0x58eec177), C32e(0x9f4575af), C32e(0xa5846342), + C32e(0x50403020), C32e(0x2ed11ae5), C32e(0x12e10efd), C32e(0xb7656dbf), + C32e(0xd4194c81), C32e(0x3c301418), C32e(0x5f4c3526), C32e(0x719d2fc3), + C32e(0x3867e1be), C32e(0xfd6aa235), C32e(0x4f0bcc88), C32e(0x4b5c392e), + C32e(0xf93d5793), C32e(0x0daaf255), C32e(0x9de382fc), C32e(0xc9f4477a), + C32e(0xef8bacc8), C32e(0x326fe7ba), C32e(0x7d642b32), C32e(0xa4d795e6), + C32e(0xfb9ba0c0), C32e(0xb3329819), C32e(0x6827d19e), C32e(0x815d7fa3), + C32e(0xaa886644), C32e(0x82a87e54), C32e(0xe676ab3b), C32e(0x9e16830b), + C32e(0x4503ca8c), C32e(0x7b9529c7), C32e(0x6ed6d36b), C32e(0x44503c28), + C32e(0x8b5579a7), C32e(0x3d63e2bc), C32e(0x272c1d16), C32e(0x9a4176ad), + C32e(0x4dad3bdb), C32e(0xfac85664), C32e(0xd2e84e74), C32e(0x22281e14), + C32e(0x763fdb92), C32e(0x1e180a0c), C32e(0xb4906c48), C32e(0x376be4b8), + C32e(0xe7255d9f), C32e(0xb2616ebd), C32e(0x2a86ef43), C32e(0xf193a6c4), + C32e(0xe372a839), C32e(0xf762a431), C32e(0x59bd37d3), C32e(0x86ff8bf2), + C32e(0x56b132d5), C32e(0xc50d438b), C32e(0xebdc596e), C32e(0xc2afb7da), + C32e(0x8f028c01), C32e(0xac7964b1), C32e(0x6d23d29c), C32e(0x3b92e049), + C32e(0xc7abb4d8), C32e(0x1543faac), C32e(0x09fd07f3), C32e(0x6f8525cf), + C32e(0xea8fafca), C32e(0x89f38ef4), C32e(0x208ee947), C32e(0x28201810), + C32e(0x64ded56f), C32e(0x83fb88f0), C32e(0xb1946f4a), C32e(0x96b8725c), + C32e(0x6c702438), C32e(0x08aef157), C32e(0x52e6c773), C32e(0xf3355197), + C32e(0x658d23cb), C32e(0x84597ca1), C32e(0xbfcb9ce8), C32e(0x637c213e), + C32e(0x7c37dd96), C32e(0x7fc2dc61), C32e(0x911a860d), C32e(0x941e850f), + C32e(0xabdb90e0), C32e(0xc6f8427c), C32e(0x57e2c471), C32e(0xe583aacc), + C32e(0x733bd890), C32e(0x0f0c0506), C32e(0x03f501f7), C32e(0x3638121c), + C32e(0xfe9fa3c2), C32e(0xe1d45f6a), C32e(0x1047f9ae), C32e(0x6bd2d069), + C32e(0xa82e9117), C32e(0xe8295899), C32e(0x6974273a), C32e(0xd04eb927), + C32e(0x48a938d9), C32e(0x35cd13eb), C32e(0xce56b32b), C32e(0x55443322), + C32e(0xd6bfbbd2), C32e(0x904970a9), C32e(0x800e8907), C32e(0xf266a733), + C32e(0xc15ab62d), C32e(0x6678223c), C32e(0xad2a9215), C32e(0x608920c9), + C32e(0xdb154987), C32e(0x1a4fffaa), C32e(0x88a07850), C32e(0x8e517aa5), + C32e(0x8a068f03), C32e(0x13b2f859), C32e(0x9b128009), C32e(0x3934171a), + C32e(0x75cada65), C32e(0x53b531d7), C32e(0x5113c684), C32e(0xd3bbb8d0), + C32e(0x5e1fc382), C32e(0xcb52b029), C32e(0x99b4775a), C32e(0x333c111e), + C32e(0x46f6cb7b), C32e(0x1f4bfca8), C32e(0x61dad66d), C32e(0x4e583a2c) +}; + +static const uint32_t T1up_cpu[] = { + C32e(0xc6c632f4), C32e(0xf8f86f97), C32e(0xeeee5eb0), C32e(0xf6f67a8c), + C32e(0xffffe817), C32e(0xd6d60adc), C32e(0xdede16c8), C32e(0x91916dfc), + C32e(0x606090f0), C32e(0x02020705), C32e(0xcece2ee0), C32e(0x5656d187), + C32e(0xe7e7cc2b), C32e(0xb5b513a6), C32e(0x4d4d7c31), C32e(0xecec59b5), + C32e(0x8f8f40cf), C32e(0x1f1fa3bc), C32e(0x898949c0), C32e(0xfafa6892), + C32e(0xefefd03f), C32e(0xb2b29426), C32e(0x8e8ece40), C32e(0xfbfbe61d), + C32e(0x41416e2f), C32e(0xb3b31aa9), C32e(0x5f5f431c), C32e(0x45456025), + C32e(0x2323f9da), C32e(0x53535102), C32e(0xe4e445a1), C32e(0x9b9b76ed), + C32e(0x7575285d), C32e(0xe1e1c524), C32e(0x3d3dd4e9), C32e(0x4c4cf2be), + C32e(0x6c6c82ee), C32e(0x7e7ebdc3), C32e(0xf5f5f306), C32e(0x838352d1), + C32e(0x68688ce4), C32e(0x51515607), C32e(0xd1d18d5c), C32e(0xf9f9e118), + C32e(0xe2e24cae), C32e(0xabab3e95), C32e(0x626297f5), C32e(0x2a2a6b41), + C32e(0x08081c14), C32e(0x959563f6), 
C32e(0x4646e9af), C32e(0x9d9d7fe2), + C32e(0x30304878), C32e(0x3737cff8), C32e(0x0a0a1b11), C32e(0x2f2febc4), + C32e(0x0e0e151b), C32e(0x24247e5a), C32e(0x1b1badb6), C32e(0xdfdf9847), + C32e(0xcdcda76a), C32e(0x4e4ef5bb), C32e(0x7f7f334c), C32e(0xeaea50ba), + C32e(0x12123f2d), C32e(0x1d1da4b9), C32e(0x5858c49c), C32e(0x34344672), + C32e(0x36364177), C32e(0xdcdc11cd), C32e(0xb4b49d29), C32e(0x5b5b4d16), + C32e(0xa4a4a501), C32e(0x7676a1d7), C32e(0xb7b714a3), C32e(0x7d7d3449), + C32e(0x5252df8d), C32e(0xdddd9f42), C32e(0x5e5ecd93), C32e(0x1313b1a2), + C32e(0xa6a6a204), C32e(0xb9b901b8), C32e(0x00000000), C32e(0xc1c1b574), + C32e(0x4040e0a0), C32e(0xe3e3c221), C32e(0x79793a43), C32e(0xb6b69a2c), + C32e(0xd4d40dd9), C32e(0x8d8d47ca), C32e(0x67671770), C32e(0x7272afdd), + C32e(0x9494ed79), C32e(0x9898ff67), C32e(0xb0b09323), C32e(0x85855bde), + C32e(0xbbbb06bd), C32e(0xc5c5bb7e), C32e(0x4f4f7b34), C32e(0xededd73a), + C32e(0x8686d254), C32e(0x9a9af862), C32e(0x666699ff), C32e(0x1111b6a7), + C32e(0x8a8ac04a), C32e(0xe9e9d930), C32e(0x04040e0a), C32e(0xfefe6698), + C32e(0xa0a0ab0b), C32e(0x7878b4cc), C32e(0x2525f0d5), C32e(0x4b4b753e), + C32e(0xa2a2ac0e), C32e(0x5d5d4419), C32e(0x8080db5b), C32e(0x05058085), + C32e(0x3f3fd3ec), C32e(0x2121fedf), C32e(0x7070a8d8), C32e(0xf1f1fd0c), + C32e(0x6363197a), C32e(0x77772f58), C32e(0xafaf309f), C32e(0x4242e7a5), + C32e(0x20207050), C32e(0xe5e5cb2e), C32e(0xfdfdef12), C32e(0xbfbf08b7), + C32e(0x818155d4), C32e(0x1818243c), C32e(0x2626795f), C32e(0xc3c3b271), + C32e(0xbebe8638), C32e(0x3535c8fd), C32e(0x8888c74f), C32e(0x2e2e654b), + C32e(0x93936af9), C32e(0x5555580d), C32e(0xfcfc619d), C32e(0x7a7ab3c9), + C32e(0xc8c827ef), C32e(0xbaba8832), C32e(0x32324f7d), C32e(0xe6e642a4), + C32e(0xc0c03bfb), C32e(0x1919aab3), C32e(0x9e9ef668), C32e(0xa3a32281), + C32e(0x4444eeaa), C32e(0x5454d682), C32e(0x3b3bdde6), C32e(0x0b0b959e), + C32e(0x8c8cc945), C32e(0xc7c7bc7b), C32e(0x6b6b056e), C32e(0x28286c44), + C32e(0xa7a72c8b), C32e(0xbcbc813d), C32e(0x16163127), C32e(0xadad379a), + C32e(0xdbdb964d), C32e(0x64649efa), C32e(0x7474a6d2), C32e(0x14143622), + C32e(0x9292e476), C32e(0x0c0c121e), C32e(0x4848fcb4), C32e(0xb8b88f37), + C32e(0x9f9f78e7), C32e(0xbdbd0fb2), C32e(0x4343692a), C32e(0xc4c435f1), + C32e(0x3939dae3), C32e(0x3131c6f7), C32e(0xd3d38a59), C32e(0xf2f27486), + C32e(0xd5d58356), C32e(0x8b8b4ec5), C32e(0x6e6e85eb), C32e(0xdada18c2), + C32e(0x01018e8f), C32e(0xb1b11dac), C32e(0x9c9cf16d), C32e(0x4949723b), + C32e(0xd8d81fc7), C32e(0xacacb915), C32e(0xf3f3fa09), C32e(0xcfcfa06f), + C32e(0xcaca20ea), C32e(0xf4f47d89), C32e(0x47476720), C32e(0x10103828), + C32e(0x6f6f0b64), C32e(0xf0f07383), C32e(0x4a4afbb1), C32e(0x5c5cca96), + C32e(0x3838546c), C32e(0x57575f08), C32e(0x73732152), C32e(0x979764f3), + C32e(0xcbcbae65), C32e(0xa1a12584), C32e(0xe8e857bf), C32e(0x3e3e5d63), + C32e(0x9696ea7c), C32e(0x61611e7f), C32e(0x0d0d9c91), C32e(0x0f0f9b94), + C32e(0xe0e04bab), C32e(0x7c7cbac6), C32e(0x71712657), C32e(0xcccc29e5), + C32e(0x9090e373), C32e(0x0606090f), C32e(0xf7f7f403), C32e(0x1c1c2a36), + C32e(0xc2c23cfe), C32e(0x6a6a8be1), C32e(0xaeaebe10), C32e(0x6969026b), + C32e(0x1717bfa8), C32e(0x999971e8), C32e(0x3a3a5369), C32e(0x2727f7d0), + C32e(0xd9d99148), C32e(0xebebde35), C32e(0x2b2be5ce), C32e(0x22227755), + C32e(0xd2d204d6), C32e(0xa9a93990), C32e(0x07078780), C32e(0x3333c1f2), + C32e(0x2d2decc1), C32e(0x3c3c5a66), C32e(0x1515b8ad), C32e(0xc9c9a960), + C32e(0x87875cdb), C32e(0xaaaab01a), C32e(0x5050d888), C32e(0xa5a52b8e), + C32e(0x0303898a), C32e(0x59594a13), 
C32e(0x0909929b), C32e(0x1a1a2339), + C32e(0x65651075), C32e(0xd7d78453), C32e(0x8484d551), C32e(0xd0d003d3), + C32e(0x8282dc5e), C32e(0x2929e2cb), C32e(0x5a5ac399), C32e(0x1e1e2d33), + C32e(0x7b7b3d46), C32e(0xa8a8b71f), C32e(0x6d6d0c61), C32e(0x2c2c624e) +}; + +static const uint32_t T1dn_cpu[] = { + C32e(0xa5f497a5), C32e(0x8497eb84), C32e(0x99b0c799), C32e(0x8d8cf78d), + C32e(0x0d17e50d), C32e(0xbddcb7bd), C32e(0xb1c8a7b1), C32e(0x54fc3954), + C32e(0x50f0c050), C32e(0x03050403), C32e(0xa9e087a9), C32e(0x7d87ac7d), + C32e(0x192bd519), C32e(0x62a67162), C32e(0xe6319ae6), C32e(0x9ab5c39a), + C32e(0x45cf0545), C32e(0x9dbc3e9d), C32e(0x40c00940), C32e(0x8792ef87), + C32e(0x153fc515), C32e(0xeb267feb), C32e(0xc94007c9), C32e(0x0b1ded0b), + C32e(0xec2f82ec), C32e(0x67a97d67), C32e(0xfd1cbefd), C32e(0xea258aea), + C32e(0xbfda46bf), C32e(0xf702a6f7), C32e(0x96a1d396), C32e(0x5bed2d5b), + C32e(0xc25deac2), C32e(0x1c24d91c), C32e(0xaee97aae), C32e(0x6abe986a), + C32e(0x5aeed85a), C32e(0x41c3fc41), C32e(0x0206f102), C32e(0x4fd11d4f), + C32e(0x5ce4d05c), C32e(0xf407a2f4), C32e(0x345cb934), C32e(0x0818e908), + C32e(0x93aedf93), C32e(0x73954d73), C32e(0x53f5c453), C32e(0x3f41543f), + C32e(0x0c14100c), C32e(0x52f63152), C32e(0x65af8c65), C32e(0x5ee2215e), + C32e(0x28786028), C32e(0xa1f86ea1), C32e(0x0f11140f), C32e(0xb5c45eb5), + C32e(0x091b1c09), C32e(0x365a4836), C32e(0x9bb6369b), C32e(0x3d47a53d), + C32e(0x266a8126), C32e(0x69bb9c69), C32e(0xcd4cfecd), C32e(0x9fbacf9f), + C32e(0x1b2d241b), C32e(0x9eb93a9e), C32e(0x749cb074), C32e(0x2e72682e), + C32e(0x2d776c2d), C32e(0xb2cda3b2), C32e(0xee2973ee), C32e(0xfb16b6fb), + C32e(0xf60153f6), C32e(0x4dd7ec4d), C32e(0x61a37561), C32e(0xce49face), + C32e(0x7b8da47b), C32e(0x3e42a13e), C32e(0x7193bc71), C32e(0x97a22697), + C32e(0xf50457f5), C32e(0x68b86968), C32e(0x00000000), C32e(0x2c74992c), + C32e(0x60a08060), C32e(0x1f21dd1f), C32e(0xc843f2c8), C32e(0xed2c77ed), + C32e(0xbed9b3be), C32e(0x46ca0146), C32e(0xd970ced9), C32e(0x4bdde44b), + C32e(0xde7933de), C32e(0xd4672bd4), C32e(0xe8237be8), C32e(0x4ade114a), + C32e(0x6bbd6d6b), C32e(0x2a7e912a), C32e(0xe5349ee5), C32e(0x163ac116), + C32e(0xc55417c5), C32e(0xd7622fd7), C32e(0x55ffcc55), C32e(0x94a72294), + C32e(0xcf4a0fcf), C32e(0x1030c910), C32e(0x060a0806), C32e(0x8198e781), + C32e(0xf00b5bf0), C32e(0x44ccf044), C32e(0xbad54aba), C32e(0xe33e96e3), + C32e(0xf30e5ff3), C32e(0xfe19bafe), C32e(0xc05b1bc0), C32e(0x8a850a8a), + C32e(0xadec7ead), C32e(0xbcdf42bc), C32e(0x48d8e048), C32e(0x040cf904), + C32e(0xdf7ac6df), C32e(0xc158eec1), C32e(0x759f4575), C32e(0x63a58463), + C32e(0x30504030), C32e(0x1a2ed11a), C32e(0x0e12e10e), C32e(0x6db7656d), + C32e(0x4cd4194c), C32e(0x143c3014), C32e(0x355f4c35), C32e(0x2f719d2f), + C32e(0xe13867e1), C32e(0xa2fd6aa2), C32e(0xcc4f0bcc), C32e(0x394b5c39), + C32e(0x57f93d57), C32e(0xf20daaf2), C32e(0x829de382), C32e(0x47c9f447), + C32e(0xacef8bac), C32e(0xe7326fe7), C32e(0x2b7d642b), C32e(0x95a4d795), + C32e(0xa0fb9ba0), C32e(0x98b33298), C32e(0xd16827d1), C32e(0x7f815d7f), + C32e(0x66aa8866), C32e(0x7e82a87e), C32e(0xabe676ab), C32e(0x839e1683), + C32e(0xca4503ca), C32e(0x297b9529), C32e(0xd36ed6d3), C32e(0x3c44503c), + C32e(0x798b5579), C32e(0xe23d63e2), C32e(0x1d272c1d), C32e(0x769a4176), + C32e(0x3b4dad3b), C32e(0x56fac856), C32e(0x4ed2e84e), C32e(0x1e22281e), + C32e(0xdb763fdb), C32e(0x0a1e180a), C32e(0x6cb4906c), C32e(0xe4376be4), + C32e(0x5de7255d), C32e(0x6eb2616e), C32e(0xef2a86ef), C32e(0xa6f193a6), + C32e(0xa8e372a8), C32e(0xa4f762a4), C32e(0x3759bd37), 
C32e(0x8b86ff8b), + C32e(0x3256b132), C32e(0x43c50d43), C32e(0x59ebdc59), C32e(0xb7c2afb7), + C32e(0x8c8f028c), C32e(0x64ac7964), C32e(0xd26d23d2), C32e(0xe03b92e0), + C32e(0xb4c7abb4), C32e(0xfa1543fa), C32e(0x0709fd07), C32e(0x256f8525), + C32e(0xafea8faf), C32e(0x8e89f38e), C32e(0xe9208ee9), C32e(0x18282018), + C32e(0xd564ded5), C32e(0x8883fb88), C32e(0x6fb1946f), C32e(0x7296b872), + C32e(0x246c7024), C32e(0xf108aef1), C32e(0xc752e6c7), C32e(0x51f33551), + C32e(0x23658d23), C32e(0x7c84597c), C32e(0x9cbfcb9c), C32e(0x21637c21), + C32e(0xdd7c37dd), C32e(0xdc7fc2dc), C32e(0x86911a86), C32e(0x85941e85), + C32e(0x90abdb90), C32e(0x42c6f842), C32e(0xc457e2c4), C32e(0xaae583aa), + C32e(0xd8733bd8), C32e(0x050f0c05), C32e(0x0103f501), C32e(0x12363812), + C32e(0xa3fe9fa3), C32e(0x5fe1d45f), C32e(0xf91047f9), C32e(0xd06bd2d0), + C32e(0x91a82e91), C32e(0x58e82958), C32e(0x27697427), C32e(0xb9d04eb9), + C32e(0x3848a938), C32e(0x1335cd13), C32e(0xb3ce56b3), C32e(0x33554433), + C32e(0xbbd6bfbb), C32e(0x70904970), C32e(0x89800e89), C32e(0xa7f266a7), + C32e(0xb6c15ab6), C32e(0x22667822), C32e(0x92ad2a92), C32e(0x20608920), + C32e(0x49db1549), C32e(0xff1a4fff), C32e(0x7888a078), C32e(0x7a8e517a), + C32e(0x8f8a068f), C32e(0xf813b2f8), C32e(0x809b1280), C32e(0x17393417), + C32e(0xda75cada), C32e(0x3153b531), C32e(0xc65113c6), C32e(0xb8d3bbb8), + C32e(0xc35e1fc3), C32e(0xb0cb52b0), C32e(0x7799b477), C32e(0x11333c11), + C32e(0xcb46f6cb), C32e(0xfc1f4bfc), C32e(0xd661dad6), C32e(0x3a4e583a) +}; + +static const uint32_t T2up_cpu[] = { + C32e(0xa5c6c632), C32e(0x84f8f86f), C32e(0x99eeee5e), C32e(0x8df6f67a), + C32e(0x0dffffe8), C32e(0xbdd6d60a), C32e(0xb1dede16), C32e(0x5491916d), + C32e(0x50606090), C32e(0x03020207), C32e(0xa9cece2e), C32e(0x7d5656d1), + C32e(0x19e7e7cc), C32e(0x62b5b513), C32e(0xe64d4d7c), C32e(0x9aecec59), + C32e(0x458f8f40), C32e(0x9d1f1fa3), C32e(0x40898949), C32e(0x87fafa68), + C32e(0x15efefd0), C32e(0xebb2b294), C32e(0xc98e8ece), C32e(0x0bfbfbe6), + C32e(0xec41416e), C32e(0x67b3b31a), C32e(0xfd5f5f43), C32e(0xea454560), + C32e(0xbf2323f9), C32e(0xf7535351), C32e(0x96e4e445), C32e(0x5b9b9b76), + C32e(0xc2757528), C32e(0x1ce1e1c5), C32e(0xae3d3dd4), C32e(0x6a4c4cf2), + C32e(0x5a6c6c82), C32e(0x417e7ebd), C32e(0x02f5f5f3), C32e(0x4f838352), + C32e(0x5c68688c), C32e(0xf4515156), C32e(0x34d1d18d), C32e(0x08f9f9e1), + C32e(0x93e2e24c), C32e(0x73abab3e), C32e(0x53626297), C32e(0x3f2a2a6b), + C32e(0x0c08081c), C32e(0x52959563), C32e(0x654646e9), C32e(0x5e9d9d7f), + C32e(0x28303048), C32e(0xa13737cf), C32e(0x0f0a0a1b), C32e(0xb52f2feb), + C32e(0x090e0e15), C32e(0x3624247e), C32e(0x9b1b1bad), C32e(0x3ddfdf98), + C32e(0x26cdcda7), C32e(0x694e4ef5), C32e(0xcd7f7f33), C32e(0x9feaea50), + C32e(0x1b12123f), C32e(0x9e1d1da4), C32e(0x745858c4), C32e(0x2e343446), + C32e(0x2d363641), C32e(0xb2dcdc11), C32e(0xeeb4b49d), C32e(0xfb5b5b4d), + C32e(0xf6a4a4a5), C32e(0x4d7676a1), C32e(0x61b7b714), C32e(0xce7d7d34), + C32e(0x7b5252df), C32e(0x3edddd9f), C32e(0x715e5ecd), C32e(0x971313b1), + C32e(0xf5a6a6a2), C32e(0x68b9b901), C32e(0x00000000), C32e(0x2cc1c1b5), + C32e(0x604040e0), C32e(0x1fe3e3c2), C32e(0xc879793a), C32e(0xedb6b69a), + C32e(0xbed4d40d), C32e(0x468d8d47), C32e(0xd9676717), C32e(0x4b7272af), + C32e(0xde9494ed), C32e(0xd49898ff), C32e(0xe8b0b093), C32e(0x4a85855b), + C32e(0x6bbbbb06), C32e(0x2ac5c5bb), C32e(0xe54f4f7b), C32e(0x16ededd7), + C32e(0xc58686d2), C32e(0xd79a9af8), C32e(0x55666699), C32e(0x941111b6), + C32e(0xcf8a8ac0), C32e(0x10e9e9d9), C32e(0x0604040e), C32e(0x81fefe66), + 
C32e(0xf0a0a0ab), C32e(0x447878b4), C32e(0xba2525f0), C32e(0xe34b4b75), + C32e(0xf3a2a2ac), C32e(0xfe5d5d44), C32e(0xc08080db), C32e(0x8a050580), + C32e(0xad3f3fd3), C32e(0xbc2121fe), C32e(0x487070a8), C32e(0x04f1f1fd), + C32e(0xdf636319), C32e(0xc177772f), C32e(0x75afaf30), C32e(0x634242e7), + C32e(0x30202070), C32e(0x1ae5e5cb), C32e(0x0efdfdef), C32e(0x6dbfbf08), + C32e(0x4c818155), C32e(0x14181824), C32e(0x35262679), C32e(0x2fc3c3b2), + C32e(0xe1bebe86), C32e(0xa23535c8), C32e(0xcc8888c7), C32e(0x392e2e65), + C32e(0x5793936a), C32e(0xf2555558), C32e(0x82fcfc61), C32e(0x477a7ab3), + C32e(0xacc8c827), C32e(0xe7baba88), C32e(0x2b32324f), C32e(0x95e6e642), + C32e(0xa0c0c03b), C32e(0x981919aa), C32e(0xd19e9ef6), C32e(0x7fa3a322), + C32e(0x664444ee), C32e(0x7e5454d6), C32e(0xab3b3bdd), C32e(0x830b0b95), + C32e(0xca8c8cc9), C32e(0x29c7c7bc), C32e(0xd36b6b05), C32e(0x3c28286c), + C32e(0x79a7a72c), C32e(0xe2bcbc81), C32e(0x1d161631), C32e(0x76adad37), + C32e(0x3bdbdb96), C32e(0x5664649e), C32e(0x4e7474a6), C32e(0x1e141436), + C32e(0xdb9292e4), C32e(0x0a0c0c12), C32e(0x6c4848fc), C32e(0xe4b8b88f), + C32e(0x5d9f9f78), C32e(0x6ebdbd0f), C32e(0xef434369), C32e(0xa6c4c435), + C32e(0xa83939da), C32e(0xa43131c6), C32e(0x37d3d38a), C32e(0x8bf2f274), + C32e(0x32d5d583), C32e(0x438b8b4e), C32e(0x596e6e85), C32e(0xb7dada18), + C32e(0x8c01018e), C32e(0x64b1b11d), C32e(0xd29c9cf1), C32e(0xe0494972), + C32e(0xb4d8d81f), C32e(0xfaacacb9), C32e(0x07f3f3fa), C32e(0x25cfcfa0), + C32e(0xafcaca20), C32e(0x8ef4f47d), C32e(0xe9474767), C32e(0x18101038), + C32e(0xd56f6f0b), C32e(0x88f0f073), C32e(0x6f4a4afb), C32e(0x725c5cca), + C32e(0x24383854), C32e(0xf157575f), C32e(0xc7737321), C32e(0x51979764), + C32e(0x23cbcbae), C32e(0x7ca1a125), C32e(0x9ce8e857), C32e(0x213e3e5d), + C32e(0xdd9696ea), C32e(0xdc61611e), C32e(0x860d0d9c), C32e(0x850f0f9b), + C32e(0x90e0e04b), C32e(0x427c7cba), C32e(0xc4717126), C32e(0xaacccc29), + C32e(0xd89090e3), C32e(0x05060609), C32e(0x01f7f7f4), C32e(0x121c1c2a), + C32e(0xa3c2c23c), C32e(0x5f6a6a8b), C32e(0xf9aeaebe), C32e(0xd0696902), + C32e(0x911717bf), C32e(0x58999971), C32e(0x273a3a53), C32e(0xb92727f7), + C32e(0x38d9d991), C32e(0x13ebebde), C32e(0xb32b2be5), C32e(0x33222277), + C32e(0xbbd2d204), C32e(0x70a9a939), C32e(0x89070787), C32e(0xa73333c1), + C32e(0xb62d2dec), C32e(0x223c3c5a), C32e(0x921515b8), C32e(0x20c9c9a9), + C32e(0x4987875c), C32e(0xffaaaab0), C32e(0x785050d8), C32e(0x7aa5a52b), + C32e(0x8f030389), C32e(0xf859594a), C32e(0x80090992), C32e(0x171a1a23), + C32e(0xda656510), C32e(0x31d7d784), C32e(0xc68484d5), C32e(0xb8d0d003), + C32e(0xc38282dc), C32e(0xb02929e2), C32e(0x775a5ac3), C32e(0x111e1e2d), + C32e(0xcb7b7b3d), C32e(0xfca8a8b7), C32e(0xd66d6d0c), C32e(0x3a2c2c62) +}; + +static const uint32_t T2dn_cpu[] = { + C32e(0xf4a5f497), C32e(0x978497eb), C32e(0xb099b0c7), C32e(0x8c8d8cf7), + C32e(0x170d17e5), C32e(0xdcbddcb7), C32e(0xc8b1c8a7), C32e(0xfc54fc39), + C32e(0xf050f0c0), C32e(0x05030504), C32e(0xe0a9e087), C32e(0x877d87ac), + C32e(0x2b192bd5), C32e(0xa662a671), C32e(0x31e6319a), C32e(0xb59ab5c3), + C32e(0xcf45cf05), C32e(0xbc9dbc3e), C32e(0xc040c009), C32e(0x928792ef), + C32e(0x3f153fc5), C32e(0x26eb267f), C32e(0x40c94007), C32e(0x1d0b1ded), + C32e(0x2fec2f82), C32e(0xa967a97d), C32e(0x1cfd1cbe), C32e(0x25ea258a), + C32e(0xdabfda46), C32e(0x02f702a6), C32e(0xa196a1d3), C32e(0xed5bed2d), + C32e(0x5dc25dea), C32e(0x241c24d9), C32e(0xe9aee97a), C32e(0xbe6abe98), + C32e(0xee5aeed8), C32e(0xc341c3fc), C32e(0x060206f1), C32e(0xd14fd11d), + C32e(0xe45ce4d0), 
C32e(0x07f407a2), C32e(0x5c345cb9), C32e(0x180818e9), + C32e(0xae93aedf), C32e(0x9573954d), C32e(0xf553f5c4), C32e(0x413f4154), + C32e(0x140c1410), C32e(0xf652f631), C32e(0xaf65af8c), C32e(0xe25ee221), + C32e(0x78287860), C32e(0xf8a1f86e), C32e(0x110f1114), C32e(0xc4b5c45e), + C32e(0x1b091b1c), C32e(0x5a365a48), C32e(0xb69bb636), C32e(0x473d47a5), + C32e(0x6a266a81), C32e(0xbb69bb9c), C32e(0x4ccd4cfe), C32e(0xba9fbacf), + C32e(0x2d1b2d24), C32e(0xb99eb93a), C32e(0x9c749cb0), C32e(0x722e7268), + C32e(0x772d776c), C32e(0xcdb2cda3), C32e(0x29ee2973), C32e(0x16fb16b6), + C32e(0x01f60153), C32e(0xd74dd7ec), C32e(0xa361a375), C32e(0x49ce49fa), + C32e(0x8d7b8da4), C32e(0x423e42a1), C32e(0x937193bc), C32e(0xa297a226), + C32e(0x04f50457), C32e(0xb868b869), C32e(0x00000000), C32e(0x742c7499), + C32e(0xa060a080), C32e(0x211f21dd), C32e(0x43c843f2), C32e(0x2ced2c77), + C32e(0xd9bed9b3), C32e(0xca46ca01), C32e(0x70d970ce), C32e(0xdd4bdde4), + C32e(0x79de7933), C32e(0x67d4672b), C32e(0x23e8237b), C32e(0xde4ade11), + C32e(0xbd6bbd6d), C32e(0x7e2a7e91), C32e(0x34e5349e), C32e(0x3a163ac1), + C32e(0x54c55417), C32e(0x62d7622f), C32e(0xff55ffcc), C32e(0xa794a722), + C32e(0x4acf4a0f), C32e(0x301030c9), C32e(0x0a060a08), C32e(0x988198e7), + C32e(0x0bf00b5b), C32e(0xcc44ccf0), C32e(0xd5bad54a), C32e(0x3ee33e96), + C32e(0x0ef30e5f), C32e(0x19fe19ba), C32e(0x5bc05b1b), C32e(0x858a850a), + C32e(0xecadec7e), C32e(0xdfbcdf42), C32e(0xd848d8e0), C32e(0x0c040cf9), + C32e(0x7adf7ac6), C32e(0x58c158ee), C32e(0x9f759f45), C32e(0xa563a584), + C32e(0x50305040), C32e(0x2e1a2ed1), C32e(0x120e12e1), C32e(0xb76db765), + C32e(0xd44cd419), C32e(0x3c143c30), C32e(0x5f355f4c), C32e(0x712f719d), + C32e(0x38e13867), C32e(0xfda2fd6a), C32e(0x4fcc4f0b), C32e(0x4b394b5c), + C32e(0xf957f93d), C32e(0x0df20daa), C32e(0x9d829de3), C32e(0xc947c9f4), + C32e(0xefacef8b), C32e(0x32e7326f), C32e(0x7d2b7d64), C32e(0xa495a4d7), + C32e(0xfba0fb9b), C32e(0xb398b332), C32e(0x68d16827), C32e(0x817f815d), + C32e(0xaa66aa88), C32e(0x827e82a8), C32e(0xe6abe676), C32e(0x9e839e16), + C32e(0x45ca4503), C32e(0x7b297b95), C32e(0x6ed36ed6), C32e(0x443c4450), + C32e(0x8b798b55), C32e(0x3de23d63), C32e(0x271d272c), C32e(0x9a769a41), + C32e(0x4d3b4dad), C32e(0xfa56fac8), C32e(0xd24ed2e8), C32e(0x221e2228), + C32e(0x76db763f), C32e(0x1e0a1e18), C32e(0xb46cb490), C32e(0x37e4376b), + C32e(0xe75de725), C32e(0xb26eb261), C32e(0x2aef2a86), C32e(0xf1a6f193), + C32e(0xe3a8e372), C32e(0xf7a4f762), C32e(0x593759bd), C32e(0x868b86ff), + C32e(0x563256b1), C32e(0xc543c50d), C32e(0xeb59ebdc), C32e(0xc2b7c2af), + C32e(0x8f8c8f02), C32e(0xac64ac79), C32e(0x6dd26d23), C32e(0x3be03b92), + C32e(0xc7b4c7ab), C32e(0x15fa1543), C32e(0x090709fd), C32e(0x6f256f85), + C32e(0xeaafea8f), C32e(0x898e89f3), C32e(0x20e9208e), C32e(0x28182820), + C32e(0x64d564de), C32e(0x838883fb), C32e(0xb16fb194), C32e(0x967296b8), + C32e(0x6c246c70), C32e(0x08f108ae), C32e(0x52c752e6), C32e(0xf351f335), + C32e(0x6523658d), C32e(0x847c8459), C32e(0xbf9cbfcb), C32e(0x6321637c), + C32e(0x7cdd7c37), C32e(0x7fdc7fc2), C32e(0x9186911a), C32e(0x9485941e), + C32e(0xab90abdb), C32e(0xc642c6f8), C32e(0x57c457e2), C32e(0xe5aae583), + C32e(0x73d8733b), C32e(0x0f050f0c), C32e(0x030103f5), C32e(0x36123638), + C32e(0xfea3fe9f), C32e(0xe15fe1d4), C32e(0x10f91047), C32e(0x6bd06bd2), + C32e(0xa891a82e), C32e(0xe858e829), C32e(0x69276974), C32e(0xd0b9d04e), + C32e(0x483848a9), C32e(0x351335cd), C32e(0xceb3ce56), C32e(0x55335544), + C32e(0xd6bbd6bf), C32e(0x90709049), C32e(0x8089800e), C32e(0xf2a7f266), + C32e(0xc1b6c15a), 
C32e(0x66226678), C32e(0xad92ad2a), C32e(0x60206089), + C32e(0xdb49db15), C32e(0x1aff1a4f), C32e(0x887888a0), C32e(0x8e7a8e51), + C32e(0x8a8f8a06), C32e(0x13f813b2), C32e(0x9b809b12), C32e(0x39173934), + C32e(0x75da75ca), C32e(0x533153b5), C32e(0x51c65113), C32e(0xd3b8d3bb), + C32e(0x5ec35e1f), C32e(0xcbb0cb52), C32e(0x997799b4), C32e(0x3311333c), + C32e(0x46cb46f6), C32e(0x1ffc1f4b), C32e(0x61d661da), C32e(0x4e3a4e58) +}; + +static const uint32_t T3up_cpu[] = { + C32e(0x97a5c6c6), C32e(0xeb84f8f8), C32e(0xc799eeee), C32e(0xf78df6f6), + C32e(0xe50dffff), C32e(0xb7bdd6d6), C32e(0xa7b1dede), C32e(0x39549191), + C32e(0xc0506060), C32e(0x04030202), C32e(0x87a9cece), C32e(0xac7d5656), + C32e(0xd519e7e7), C32e(0x7162b5b5), C32e(0x9ae64d4d), C32e(0xc39aecec), + C32e(0x05458f8f), C32e(0x3e9d1f1f), C32e(0x09408989), C32e(0xef87fafa), + C32e(0xc515efef), C32e(0x7febb2b2), C32e(0x07c98e8e), C32e(0xed0bfbfb), + C32e(0x82ec4141), C32e(0x7d67b3b3), C32e(0xbefd5f5f), C32e(0x8aea4545), + C32e(0x46bf2323), C32e(0xa6f75353), C32e(0xd396e4e4), C32e(0x2d5b9b9b), + C32e(0xeac27575), C32e(0xd91ce1e1), C32e(0x7aae3d3d), C32e(0x986a4c4c), + C32e(0xd85a6c6c), C32e(0xfc417e7e), C32e(0xf102f5f5), C32e(0x1d4f8383), + C32e(0xd05c6868), C32e(0xa2f45151), C32e(0xb934d1d1), C32e(0xe908f9f9), + C32e(0xdf93e2e2), C32e(0x4d73abab), C32e(0xc4536262), C32e(0x543f2a2a), + C32e(0x100c0808), C32e(0x31529595), C32e(0x8c654646), C32e(0x215e9d9d), + C32e(0x60283030), C32e(0x6ea13737), C32e(0x140f0a0a), C32e(0x5eb52f2f), + C32e(0x1c090e0e), C32e(0x48362424), C32e(0x369b1b1b), C32e(0xa53ddfdf), + C32e(0x8126cdcd), C32e(0x9c694e4e), C32e(0xfecd7f7f), C32e(0xcf9feaea), + C32e(0x241b1212), C32e(0x3a9e1d1d), C32e(0xb0745858), C32e(0x682e3434), + C32e(0x6c2d3636), C32e(0xa3b2dcdc), C32e(0x73eeb4b4), C32e(0xb6fb5b5b), + C32e(0x53f6a4a4), C32e(0xec4d7676), C32e(0x7561b7b7), C32e(0xface7d7d), + C32e(0xa47b5252), C32e(0xa13edddd), C32e(0xbc715e5e), C32e(0x26971313), + C32e(0x57f5a6a6), C32e(0x6968b9b9), C32e(0x00000000), C32e(0x992cc1c1), + C32e(0x80604040), C32e(0xdd1fe3e3), C32e(0xf2c87979), C32e(0x77edb6b6), + C32e(0xb3bed4d4), C32e(0x01468d8d), C32e(0xced96767), C32e(0xe44b7272), + C32e(0x33de9494), C32e(0x2bd49898), C32e(0x7be8b0b0), C32e(0x114a8585), + C32e(0x6d6bbbbb), C32e(0x912ac5c5), C32e(0x9ee54f4f), C32e(0xc116eded), + C32e(0x17c58686), C32e(0x2fd79a9a), C32e(0xcc556666), C32e(0x22941111), + C32e(0x0fcf8a8a), C32e(0xc910e9e9), C32e(0x08060404), C32e(0xe781fefe), + C32e(0x5bf0a0a0), C32e(0xf0447878), C32e(0x4aba2525), C32e(0x96e34b4b), + C32e(0x5ff3a2a2), C32e(0xbafe5d5d), C32e(0x1bc08080), C32e(0x0a8a0505), + C32e(0x7ead3f3f), C32e(0x42bc2121), C32e(0xe0487070), C32e(0xf904f1f1), + C32e(0xc6df6363), C32e(0xeec17777), C32e(0x4575afaf), C32e(0x84634242), + C32e(0x40302020), C32e(0xd11ae5e5), C32e(0xe10efdfd), C32e(0x656dbfbf), + C32e(0x194c8181), C32e(0x30141818), C32e(0x4c352626), C32e(0x9d2fc3c3), + C32e(0x67e1bebe), C32e(0x6aa23535), C32e(0x0bcc8888), C32e(0x5c392e2e), + C32e(0x3d579393), C32e(0xaaf25555), C32e(0xe382fcfc), C32e(0xf4477a7a), + C32e(0x8bacc8c8), C32e(0x6fe7baba), C32e(0x642b3232), C32e(0xd795e6e6), + C32e(0x9ba0c0c0), C32e(0x32981919), C32e(0x27d19e9e), C32e(0x5d7fa3a3), + C32e(0x88664444), C32e(0xa87e5454), C32e(0x76ab3b3b), C32e(0x16830b0b), + C32e(0x03ca8c8c), C32e(0x9529c7c7), C32e(0xd6d36b6b), C32e(0x503c2828), + C32e(0x5579a7a7), C32e(0x63e2bcbc), C32e(0x2c1d1616), C32e(0x4176adad), + C32e(0xad3bdbdb), C32e(0xc8566464), C32e(0xe84e7474), C32e(0x281e1414), + C32e(0x3fdb9292), C32e(0x180a0c0c), 
C32e(0x906c4848), C32e(0x6be4b8b8), + C32e(0x255d9f9f), C32e(0x616ebdbd), C32e(0x86ef4343), C32e(0x93a6c4c4), + C32e(0x72a83939), C32e(0x62a43131), C32e(0xbd37d3d3), C32e(0xff8bf2f2), + C32e(0xb132d5d5), C32e(0x0d438b8b), C32e(0xdc596e6e), C32e(0xafb7dada), + C32e(0x028c0101), C32e(0x7964b1b1), C32e(0x23d29c9c), C32e(0x92e04949), + C32e(0xabb4d8d8), C32e(0x43faacac), C32e(0xfd07f3f3), C32e(0x8525cfcf), + C32e(0x8fafcaca), C32e(0xf38ef4f4), C32e(0x8ee94747), C32e(0x20181010), + C32e(0xded56f6f), C32e(0xfb88f0f0), C32e(0x946f4a4a), C32e(0xb8725c5c), + C32e(0x70243838), C32e(0xaef15757), C32e(0xe6c77373), C32e(0x35519797), + C32e(0x8d23cbcb), C32e(0x597ca1a1), C32e(0xcb9ce8e8), C32e(0x7c213e3e), + C32e(0x37dd9696), C32e(0xc2dc6161), C32e(0x1a860d0d), C32e(0x1e850f0f), + C32e(0xdb90e0e0), C32e(0xf8427c7c), C32e(0xe2c47171), C32e(0x83aacccc), + C32e(0x3bd89090), C32e(0x0c050606), C32e(0xf501f7f7), C32e(0x38121c1c), + C32e(0x9fa3c2c2), C32e(0xd45f6a6a), C32e(0x47f9aeae), C32e(0xd2d06969), + C32e(0x2e911717), C32e(0x29589999), C32e(0x74273a3a), C32e(0x4eb92727), + C32e(0xa938d9d9), C32e(0xcd13ebeb), C32e(0x56b32b2b), C32e(0x44332222), + C32e(0xbfbbd2d2), C32e(0x4970a9a9), C32e(0x0e890707), C32e(0x66a73333), + C32e(0x5ab62d2d), C32e(0x78223c3c), C32e(0x2a921515), C32e(0x8920c9c9), + C32e(0x15498787), C32e(0x4fffaaaa), C32e(0xa0785050), C32e(0x517aa5a5), + C32e(0x068f0303), C32e(0xb2f85959), C32e(0x12800909), C32e(0x34171a1a), + C32e(0xcada6565), C32e(0xb531d7d7), C32e(0x13c68484), C32e(0xbbb8d0d0), + C32e(0x1fc38282), C32e(0x52b02929), C32e(0xb4775a5a), C32e(0x3c111e1e), + C32e(0xf6cb7b7b), C32e(0x4bfca8a8), C32e(0xdad66d6d), C32e(0x583a2c2c) +}; + +static const uint32_t T3dn_cpu[] = { + C32e(0x32f4a5f4), C32e(0x6f978497), C32e(0x5eb099b0), C32e(0x7a8c8d8c), + C32e(0xe8170d17), C32e(0x0adcbddc), C32e(0x16c8b1c8), C32e(0x6dfc54fc), + C32e(0x90f050f0), C32e(0x07050305), C32e(0x2ee0a9e0), C32e(0xd1877d87), + C32e(0xcc2b192b), C32e(0x13a662a6), C32e(0x7c31e631), C32e(0x59b59ab5), + C32e(0x40cf45cf), C32e(0xa3bc9dbc), C32e(0x49c040c0), C32e(0x68928792), + C32e(0xd03f153f), C32e(0x9426eb26), C32e(0xce40c940), C32e(0xe61d0b1d), + C32e(0x6e2fec2f), C32e(0x1aa967a9), C32e(0x431cfd1c), C32e(0x6025ea25), + C32e(0xf9dabfda), C32e(0x5102f702), C32e(0x45a196a1), C32e(0x76ed5bed), + C32e(0x285dc25d), C32e(0xc5241c24), C32e(0xd4e9aee9), C32e(0xf2be6abe), + C32e(0x82ee5aee), C32e(0xbdc341c3), C32e(0xf3060206), C32e(0x52d14fd1), + C32e(0x8ce45ce4), C32e(0x5607f407), C32e(0x8d5c345c), C32e(0xe1180818), + C32e(0x4cae93ae), C32e(0x3e957395), C32e(0x97f553f5), C32e(0x6b413f41), + C32e(0x1c140c14), C32e(0x63f652f6), C32e(0xe9af65af), C32e(0x7fe25ee2), + C32e(0x48782878), C32e(0xcff8a1f8), C32e(0x1b110f11), C32e(0xebc4b5c4), + C32e(0x151b091b), C32e(0x7e5a365a), C32e(0xadb69bb6), C32e(0x98473d47), + C32e(0xa76a266a), C32e(0xf5bb69bb), C32e(0x334ccd4c), C32e(0x50ba9fba), + C32e(0x3f2d1b2d), C32e(0xa4b99eb9), C32e(0xc49c749c), C32e(0x46722e72), + C32e(0x41772d77), C32e(0x11cdb2cd), C32e(0x9d29ee29), C32e(0x4d16fb16), + C32e(0xa501f601), C32e(0xa1d74dd7), C32e(0x14a361a3), C32e(0x3449ce49), + C32e(0xdf8d7b8d), C32e(0x9f423e42), C32e(0xcd937193), C32e(0xb1a297a2), + C32e(0xa204f504), C32e(0x01b868b8), C32e(0x00000000), C32e(0xb5742c74), + C32e(0xe0a060a0), C32e(0xc2211f21), C32e(0x3a43c843), C32e(0x9a2ced2c), + C32e(0x0dd9bed9), C32e(0x47ca46ca), C32e(0x1770d970), C32e(0xafdd4bdd), + C32e(0xed79de79), C32e(0xff67d467), C32e(0x9323e823), C32e(0x5bde4ade), + C32e(0x06bd6bbd), C32e(0xbb7e2a7e), C32e(0x7b34e534), 
C32e(0xd73a163a), + C32e(0xd254c554), C32e(0xf862d762), C32e(0x99ff55ff), C32e(0xb6a794a7), + C32e(0xc04acf4a), C32e(0xd9301030), C32e(0x0e0a060a), C32e(0x66988198), + C32e(0xab0bf00b), C32e(0xb4cc44cc), C32e(0xf0d5bad5), C32e(0x753ee33e), + C32e(0xac0ef30e), C32e(0x4419fe19), C32e(0xdb5bc05b), C32e(0x80858a85), + C32e(0xd3ecadec), C32e(0xfedfbcdf), C32e(0xa8d848d8), C32e(0xfd0c040c), + C32e(0x197adf7a), C32e(0x2f58c158), C32e(0x309f759f), C32e(0xe7a563a5), + C32e(0x70503050), C32e(0xcb2e1a2e), C32e(0xef120e12), C32e(0x08b76db7), + C32e(0x55d44cd4), C32e(0x243c143c), C32e(0x795f355f), C32e(0xb2712f71), + C32e(0x8638e138), C32e(0xc8fda2fd), C32e(0xc74fcc4f), C32e(0x654b394b), + C32e(0x6af957f9), C32e(0x580df20d), C32e(0x619d829d), C32e(0xb3c947c9), + C32e(0x27efacef), C32e(0x8832e732), C32e(0x4f7d2b7d), C32e(0x42a495a4), + C32e(0x3bfba0fb), C32e(0xaab398b3), C32e(0xf668d168), C32e(0x22817f81), + C32e(0xeeaa66aa), C32e(0xd6827e82), C32e(0xdde6abe6), C32e(0x959e839e), + C32e(0xc945ca45), C32e(0xbc7b297b), C32e(0x056ed36e), C32e(0x6c443c44), + C32e(0x2c8b798b), C32e(0x813de23d), C32e(0x31271d27), C32e(0x379a769a), + C32e(0x964d3b4d), C32e(0x9efa56fa), C32e(0xa6d24ed2), C32e(0x36221e22), + C32e(0xe476db76), C32e(0x121e0a1e), C32e(0xfcb46cb4), C32e(0x8f37e437), + C32e(0x78e75de7), C32e(0x0fb26eb2), C32e(0x692aef2a), C32e(0x35f1a6f1), + C32e(0xdae3a8e3), C32e(0xc6f7a4f7), C32e(0x8a593759), C32e(0x74868b86), + C32e(0x83563256), C32e(0x4ec543c5), C32e(0x85eb59eb), C32e(0x18c2b7c2), + C32e(0x8e8f8c8f), C32e(0x1dac64ac), C32e(0xf16dd26d), C32e(0x723be03b), + C32e(0x1fc7b4c7), C32e(0xb915fa15), C32e(0xfa090709), C32e(0xa06f256f), + C32e(0x20eaafea), C32e(0x7d898e89), C32e(0x6720e920), C32e(0x38281828), + C32e(0x0b64d564), C32e(0x73838883), C32e(0xfbb16fb1), C32e(0xca967296), + C32e(0x546c246c), C32e(0x5f08f108), C32e(0x2152c752), C32e(0x64f351f3), + C32e(0xae652365), C32e(0x25847c84), C32e(0x57bf9cbf), C32e(0x5d632163), + C32e(0xea7cdd7c), C32e(0x1e7fdc7f), C32e(0x9c918691), C32e(0x9b948594), + C32e(0x4bab90ab), C32e(0xbac642c6), C32e(0x2657c457), C32e(0x29e5aae5), + C32e(0xe373d873), C32e(0x090f050f), C32e(0xf4030103), C32e(0x2a361236), + C32e(0x3cfea3fe), C32e(0x8be15fe1), C32e(0xbe10f910), C32e(0x026bd06b), + C32e(0xbfa891a8), C32e(0x71e858e8), C32e(0x53692769), C32e(0xf7d0b9d0), + C32e(0x91483848), C32e(0xde351335), C32e(0xe5ceb3ce), C32e(0x77553355), + C32e(0x04d6bbd6), C32e(0x39907090), C32e(0x87808980), C32e(0xc1f2a7f2), + C32e(0xecc1b6c1), C32e(0x5a662266), C32e(0xb8ad92ad), C32e(0xa9602060), + C32e(0x5cdb49db), C32e(0xb01aff1a), C32e(0xd8887888), C32e(0x2b8e7a8e), + C32e(0x898a8f8a), C32e(0x4a13f813), C32e(0x929b809b), C32e(0x23391739), + C32e(0x1075da75), C32e(0x84533153), C32e(0xd551c651), C32e(0x03d3b8d3), + C32e(0xdc5ec35e), C32e(0xe2cbb0cb), C32e(0xc3997799), C32e(0x2d331133), + C32e(0x3d46cb46), C32e(0xb71ffc1f), C32e(0x0c61d661), C32e(0x624e3a4e) +}; + +__device__ void groestl512_perm_P(uint32_t *a) +{ + uint32_t t[32]; + +//#pragma unroll 14 + for(int r=0;r<14;r++) + { +#pragma unroll 16 + for(int k=0;k<16;k++) + { + a[(k*2)+0] ^= PC32up(k * 0x10, r); + //a[(k<<1)+1] ^= PC32dn(k * 0x10, r); + } + + // RBTT +#pragma unroll 16 + for(int k=0;k<32;k+=2) + { + t[k + 0] = T0up( B32_0(a[k & 0x1f]) ) ^ + T1up( B32_1(a[(k + 2) & 0x1f]) ) ^ + T2up( B32_2(a[(k + 4) & 0x1f]) ) ^ + T3up( B32_3(a[(k + 6) & 0x1f]) ) ^ + T0dn( B32_0(a[(k + 9) & 0x1f]) ) ^ + T1dn( B32_1(a[(k + 11) & 0x1f]) ) ^ + T2dn( B32_2(a[(k + 13) & 0x1f]) ) ^ + T3dn( B32_3(a[(k + 23) & 0x1f]) ); + + t[k + 1] = T0dn( B32_0(a[k & 
0x1f]) ) ^ + T1dn( B32_1(a[(k + 2) & 0x1f]) ) ^ + T2dn( B32_2(a[(k + 4) & 0x1f]) ) ^ + T3dn( B32_3(a[(k + 6) & 0x1f]) ) ^ + T0up( B32_0(a[(k + 9) & 0x1f]) ) ^ + T1up( B32_1(a[(k + 11) & 0x1f]) ) ^ + T2up( B32_2(a[(k + 13) & 0x1f]) ) ^ + T3up( B32_3(a[(k + 23) & 0x1f]) ); + } +#pragma unroll 32 + for(int k=0;k<32;k++) + a[k] = t[k]; + } +} + +__device__ void groestl512_perm_Q(uint32_t *a) +{ +//#pragma unroll 14 + for(int r=0;r<14;r++) + { + uint32_t t[32]; + +#pragma unroll 16 + for(int k=0;k<16;k++) + { + a[(k*2)+0] ^= QC32up(k * 0x10, r); + a[(k*2)+1] ^= QC32dn(k * 0x10, r); + } + + // RBTT +#pragma unroll 16 + for(int k=0;k<32;k+=2) + { + t[k + 0] = T0up( B32_0(a[(k + 2) & 0x1f]) ) ^ + T1up( B32_1(a[(k + 6) & 0x1f]) ) ^ + T2up( B32_2(a[(k + 10) & 0x1f]) ) ^ + T3up( B32_3(a[(k + 22) & 0x1f]) ) ^ + T0dn( B32_0(a[(k + 1) & 0x1f]) ) ^ + T1dn( B32_1(a[(k + 5) & 0x1f]) ) ^ + T2dn( B32_2(a[(k + 9) & 0x1f]) ) ^ + T3dn( B32_3(a[(k + 13) & 0x1f]) ); + + t[k + 1] = T0dn( B32_0(a[(k + 2) & 0x1f]) ) ^ + T1dn( B32_1(a[(k + 6) & 0x1f]) ) ^ + T2dn( B32_2(a[(k + 10) & 0x1f]) ) ^ + T3dn( B32_3(a[(k + 22) & 0x1f]) ) ^ + T0up( B32_0(a[(k + 1) & 0x1f]) ) ^ + T1up( B32_1(a[(k + 5) & 0x1f]) ) ^ + T2up( B32_2(a[(k + 9) & 0x1f]) ) ^ + T3up( B32_3(a[(k + 13) & 0x1f]) ); + } +#pragma unroll 32 + for(int k=0;k<32;k++) + a[k] = t[k]; + } +} + +__global__ void groestl512_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector) +{ + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { +#if USE_SHARED + extern __shared__ unsigned char s[]; + uint32_t offset = W_ALIGNMENT * sizeof(uint32_t) * threadIdx.x; + uint32_t *message = (uint32_t*)(&s[offset + 0]); // 128 Byte + uint32_t *state = (uint32_t*)(&s[offset + 128]); // 128 Byte +#else + uint32_t message[32]; + uint32_t state[32]; +#endif + + // lese message ein & verknüpfe diese mit dem hash1 von hefty1 + // lese den state ein + +#pragma unroll 32 + for(int k=0;k<32;k++) + { + state[k] = groestl_gpu_state[k]; + message[k] = groestl_gpu_msg[k]; + } + + uint32_t nounce = nonceVector[thread]; + // nounce setzen + //message[19] = startNounce + thread; + message[19] = nounce; + + uint32_t hashPosition = nounce - startNounce; + + // den richtigen Hefty1 Hash holen +// memcpy(&message[21], &heftyHashes[8 * hashPosition], sizeof(uint32_t) * 8); + uint32_t *heftyHash = &heftyHashes[8 * hashPosition]; +#pragma unroll 8 + for (int k=0; k<8; ++k) + message[21+k] = heftyHash[k]; + + uint32_t g[32]; +#pragma unroll 32 + for(int u=0;u<32;u++) + g[u] = message[u] ^ state[u]; + + // Perm + groestl512_perm_P(g); + groestl512_perm_Q(message); + +#pragma unroll 32 + for(int u=0;u<32;u++) + { + state[u] ^= g[u] ^ message[u]; + g[u] = state[u]; + } + + groestl512_perm_P(g); + +#pragma unroll 32 + for(int u=0;u<32;u++) + state[u] ^= g[u]; + + // kopiere Ergebnis +#pragma unroll 16 + for(int k=0;k<16;k++) + ((uint32_t*)outputHash)[16*hashPosition+k] = state[k + 16]; + } +} + +#define texDef(texname, texmem, texsource, texsize) \ + unsigned int *texmem; \ + cudaMalloc(&texmem, texsize); \ + cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \ + texname.normalized = 0; \ + texname.filterMode = cudaFilterModePoint; \ + texname.addressMode[0] = cudaAddressModeClamp; \ + { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); \ + cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \ + +// Setup-Funktionen +__host__ void groestl512_cpu_init(int thr_id, int threads) +{ + // Texturen 
initialised with the texDef macro above + texDef(t0up, d_T0up, T0up_cpu, sizeof(uint32_t)*256); + texDef(t0dn, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256); + texDef(t1up, d_T1up, T1up_cpu, sizeof(uint32_t)*256); + texDef(t1dn, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256); + texDef(t2up, d_T2up, T2up_cpu, sizeof(uint32_t)*256); + texDef(t2dn, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256); + texDef(t3up, d_T3up, T3up_cpu, sizeof(uint32_t)*256); + texDef(t3dn, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256); + + // allocate memory for all results + cudaMalloc(&d_hash4output[thr_id], 16 * sizeof(uint32_t) * threads); +} + +__host__ void groestl512_cpu_setBlock(void *data) + // data must be 84 bytes! + // heftyHash is 32 bytes +{ + // expand and set the message + uint32_t msgBlock[32]; + + memset(msgBlock, 0, sizeof(uint32_t) * 32); + memcpy(&msgBlock[0], data, 84); + + // extend the message to a full message block (padding) + // our message is 116 bytes long + msgBlock[29] = 0x80; + msgBlock[31] = 0x01000000; + + // groestl512 needs no CPU code for this (the single round is + // executed on the GPU) + + // set up the registers + uint32_t groestl_state_init[32]; + memset(groestl_state_init, 0, sizeof(uint32_t) * 32); + groestl_state_init[31] = 0x20000; + + // store the state + cudaMemcpyToSymbol( groestl_gpu_state, + groestl_state_init, + 128); + + // set the block header (the correct nonce and Hefty hash are still missing from it) + cudaMemcpyToSymbol( groestl_gpu_msg, + msgBlock, + 128); +} + +__host__ void groestl512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy) +{ + // copy the Hefty1 hashes (really only needed for debugging) + if (copy) + cudaMemcpy( d_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice ); +} + +__host__ void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce) +{ + const int threadsperblock = 128; + + // compute how many thread blocks we need + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + // size of the dynamic shared memory region (depends on the thread count) +#if USE_SHARED + size_t shared_size = W_ALIGNMENT*sizeof(uint32_t)*threadsperblock; // one extra uint32_t to avoid bank conflicts +#else + size_t shared_size = 0; +#endif + +// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); + + groestl512_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_hash4output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); +} diff --git a/cuda_groestl512.h b/cuda_groestl512.h new file mode 100644 index 0000000..bbeee40 --- /dev/null +++ b/cuda_groestl512.h @@ -0,0 +1,9 @@ +#ifndef _CUDA_GROESTL512_H +#define _CUDA_GROESTL512_H + +void groestl512_cpu_init(int thr_id, int threads); +void groestl512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy); +void groestl512_cpu_setBlock(void *data); +void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce); + +#endif \ No newline at end of file diff --git a/cuda_hefty1.cu b/cuda_hefty1.cu new file mode 100644 index 0000000..c90d15d --- /dev/null +++ b/cuda_hefty1.cu @@ -0,0 +1,401 @@ +/* This function is laid out for 84-byte input data (Heavycoin) */ +#include +#include "cuda_runtime.h" +#include "device_launch_parameters.h" + +#include +#include + +#define USE_SHARED 0 +#define W_ALIGNMENT 65 + +// replace the following definitions with a proper header later +typedef unsigned int uint32_t; +typedef unsigned char uint8_t; 
+typedef unsigned short uint16_t; + +// globaler Speicher für alle HeftyHashes aller Threads +uint32_t *d_heftyHashes[8]; + +/* Hash-Tabellen */ +__constant__ uint32_t hefty_gpu_constantTable[64]; + +// muss expandiert werden +__constant__ uint32_t hefty_gpu_blockHeader[16]; // 2x512 Bit Message +__constant__ uint32_t hefty_gpu_register[8]; +__constant__ uint32_t hefty_gpu_sponge[4]; + +uint32_t hefty_cpu_hashTable[] = { 0x6a09e667UL, + 0xbb67ae85UL, + 0x3c6ef372UL, + 0xa54ff53aUL, + 0x510e527fUL, + 0x9b05688cUL, + 0x1f83d9abUL, + 0x5be0cd19UL }; +uint32_t hefty_cpu_constantTable[] = { + 0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL, + 0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL, + 0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL, + 0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, 0xc19bf174UL, + 0xe49b69c1UL, 0xefbe4786UL, 0x0fc19dc6UL, 0x240ca1ccUL, + 0x2de92c6fUL, 0x4a7484aaUL, 0x5cb0a9dcUL, 0x76f988daUL, + 0x983e5152UL, 0xa831c66dUL, 0xb00327c8UL, 0xbf597fc7UL, + 0xc6e00bf3UL, 0xd5a79147UL, 0x06ca6351UL, 0x14292967UL, + 0x27b70a85UL, 0x2e1b2138UL, 0x4d2c6dfcUL, 0x53380d13UL, + 0x650a7354UL, 0x766a0abbUL, 0x81c2c92eUL, 0x92722c85UL, + 0xa2bfe8a1UL, 0xa81a664bUL, 0xc24b8b70UL, 0xc76c51a3UL, + 0xd192e819UL, 0xd6990624UL, 0xf40e3585UL, 0x106aa070UL, + 0x19a4c116UL, 0x1e376c08UL, 0x2748774cUL, 0x34b0bcb5UL, + 0x391c0cb3UL, 0x4ed8aa4aUL, 0x5b9cca4fUL, 0x682e6ff3UL, + 0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL, + 0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL +}; + +#define S(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) +#define R(x, n) ((x) >> (n)) +#define Ch(x, y, z) ((x & (y ^ z)) ^ z) +#define Maj(x, y, z) ((x & (y | z)) | (y & z)) +#define S0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22)) +#define S1(x) (S(x, 6) ^ S(x, 11) ^ S(x, 25)) +#define s0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3)) +#define s1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10)) + +#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) ) + +// uint8_t +#define smoosh4(x) ( ((x)>>4) ^ ((x) & 0x0F) ) +__host__ __forceinline__ __device__ uint8_t smoosh2(uint32_t x) +{ + uint16_t w = (x >> 16) ^ (x & 0xffff); + uint8_t n = smoosh4( (uint8_t)( (w >> 8) ^ (w & 0xFF) ) ); + return (n >> 2) ^ (n & 0x03); +} +// 4 auf einmal +#define smoosh4Quad(x) ( (((x)>>4) ^ (x)) & 0x0F0F0F0F ) +#define getByte(x,y) ( ((x) >> (y)) & 0xFF ) + +__host__ __device__ void Mangle(uint32_t *inp) +{ + uint32_t r = smoosh4Quad(inp[0]); + //uint8_t r0 = smoosh4( (uint8_t)(inp[0] >> 24) ); + //uint8_t r1 = smoosh4( (uint8_t)(inp[0] >> 16) ); + //uint8_t r2 = smoosh4( (uint8_t)(inp[0] >> 8) ); + //uint8_t r3 = smoosh4( (uint8_t)(inp[0] & 0xFF) ); + + inp[1] = inp[1] ^ S(inp[0], getByte(r, 24)); + + switch (smoosh2(inp[1])) { + case 0: inp[2] ^= S(inp[0], 1 + getByte(r,24)); break; + case 1: inp[2] += S(~inp[0], 1 + getByte(r,16)); break; + case 2: inp[2] &= S(~inp[0], 1 + getByte(r,8)); break; + case 3: inp[2] ^= S(inp[0], 1 + getByte(r,0)); break; + } + + uint32_t tmp = smoosh2(inp[1] ^ inp[2]); + switch (tmp) { + case 0: inp[3] ^= S(inp[0], 2 + getByte(r,24)); break; + case 1: inp[3] += S(~inp[0], 2 + getByte(r,16)); break; + case 2: inp[3] &= S(~inp[0], 2 + getByte(r,8)); break; + case 3: inp[3] ^= S(inp[0], 2 + getByte(r,0)); break; + } + + inp[0] ^= (inp[1] ^ inp[2]) + inp[3]; +} + +__host__ __forceinline__ __device__ void Absorb(uint32_t *inp, uint32_t x) +{ + inp[0] ^= x; + Mangle(inp); +} + +__host__ __forceinline__ __device__ uint32_t Squeeze(uint32_t *inp) +{ + uint32_t 
y = inp[0]; + Mangle(inp); + return y; +} + +__host__ __forceinline__ __device__ uint32_t Br(uint32_t *sponge, uint32_t x) +{ + uint32_t r = Squeeze(sponge); + + //uint8_t r0 = r >> 8; + uint8_t r1 = r & 0xFF; + uint32_t y = 1 << ((r >> 8) & 0x1F); + + //uint32_t retVal; + //retVal = x; + + uint32_t resArr[4]; + resArr[0] = x; + resArr[1] = x & ~y; + resArr[2] = x | y; + resArr[3] = x ^ y; + return resArr[r1 & 0x03]; + + /* + switch(r1 & 0x03) + { + case 0: + break; + case 1: + retVal = x & ~y; + break; + case 2: + retVal = x | y; + break; + case 3: + retVal = x ^ y; + break; + } + return retVal; + */ +} + +__forceinline__ __device__ void hefty_gpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge) +{ + uint32_t tmpBr; + + uint32_t brG = Br(sponge, regs[6]); + uint32_t brF = Br(sponge, regs[5]); + uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K; + uint32_t brE = Br(sponge, regs[4]); + uint32_t tmp2 = tmp1 + S1(brE); + uint32_t brC = Br(sponge, regs[2]); + uint32_t brB = Br(sponge, regs[1]); + uint32_t brA = Br(sponge, regs[0]); + uint32_t tmp3 = Maj(brA, brB, brC); + tmpBr = Br(sponge, regs[0]); + uint32_t tmp4 = tmp3 + S0(tmpBr); + tmpBr = Br(sponge, tmp2); + + #pragma unroll 7 + for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; + regs[0] = tmp2 + tmp4; + regs[4] += tmpBr; +} + +__host__ void hefty_cpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge) +{ + uint32_t tmpBr; + + uint32_t brG = Br(sponge, regs[6]); + uint32_t brF = Br(sponge, regs[5]); + uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K; + uint32_t brE = Br(sponge, regs[4]); + uint32_t tmp2 = tmp1 + S1(brE); + uint32_t brC = Br(sponge, regs[2]); + uint32_t brB = Br(sponge, regs[1]); + uint32_t brA = Br(sponge, regs[0]); + uint32_t tmp3 = Maj(brA, brB, brC); + tmpBr = Br(sponge, regs[0]); + uint32_t tmp4 = tmp3 + S0(tmpBr); + tmpBr = Br(sponge, tmp2); + + for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; + regs[0] = tmp2 + tmp4; + regs[4] += tmpBr; +} + +// Die Hash-Funktion +__global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHash) +{ + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + // bestimme den aktuellen Zähler + uint32_t nounce = startNounce + thread; + + // jeder thread in diesem Block bekommt sein eigenes W Array im Shared memory +#if USE_SHARED + extern __shared__ unsigned char s[]; + uint32_t *W = (uint32_t *)(&s[W_ALIGNMENT * sizeof(uint32_t) * threadIdx.x]); +#else + // reduktion von 256 byte auf 128 byte + uint32_t W1[16]; + uint32_t W2[16]; +#endif + + // Initialisiere die register a bis h mit der Hash-Tabelle + uint32_t regs[8]; + uint32_t hash[8]; + uint32_t sponge[4]; + +#pragma unroll 4 + for(int k=0; k < 4; k++) + sponge[k] = hefty_gpu_sponge[k]; + + // pre +#pragma unroll 8 + for (int k=0; k < 8; k++) + { + regs[k] = hefty_gpu_register[k]; + hash[k] = regs[k]; + } + + //memcpy(W, &hefty_gpu_blockHeader[0], sizeof(uint32_t) * 16); // verbleibende 20 bytes aus Block 2 plus padding +#pragma unroll 16 + for(int k=0;k<16;k++) + W1[k] = hefty_gpu_blockHeader[k]; + W1[3] = SWAB32(nounce); + + + // 2. Runde +#pragma unroll 16 + for(int j=0;j<16;j++) + Absorb(sponge, W1[j] ^ hefty_gpu_constantTable[j]); + +// Progress W1 (Bytes 0...63) +#pragma unroll 16 + for(int j=0;j<16;j++) + { + Absorb(sponge, regs[3] ^ regs[7]); + hefty_gpu_round(regs, W1[j], hefty_gpu_constantTable[j], sponge); + } + +// Progress W2 (Bytes 64...127) then W3 (Bytes 128...191) ... 
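+ // The three-pass loop below applies the SHA-256 style message expansion + // W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16] (16 <= j < 64) + // sixteen words at a time, so that only the previous block W1 and the block + // being built, W2, have to live in registers. A plain sketch of the same + // schedule, assuming a full 64-word array W as used on the CPU side in + // hefty_cpu_setBlock: + // for (int j = 16; j < 64; j++) + // W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16]; + // The W[j-2] and W[j-7] terms migrate from W1 to W2 as j grows, which is why + // the index ranges 0..1, 2..6, 7..14 and 15 are handled separately below.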
+ +#pragma unroll 3 + for(int k=0;k<3;k++) + { + #pragma unroll 2 + for(int j=0;j<2;j++) + W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j]; + #pragma unroll 5 + for(int j=2;j<7;j++) + W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j]; + + #pragma unroll 8 + for(int j=7;j<15;j++) + W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j]; + + W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15]; + + #pragma unroll 16 + for(int j=0;j<16;j++) + { + Absorb(sponge, regs[3] + regs[7]); + hefty_gpu_round(regs, W2[j], hefty_gpu_constantTable[j + 16 * (k+1)], sponge); + } + #pragma unroll 16 + for(int j=0;j<16;j++) + W1[j] = W2[j]; + } + + +#pragma unroll 8 + for(int k=0;k<8;k++) + hash[k] += regs[k]; + +#pragma unroll 8 + for(int k=0;k<8;k++) + ((uint32_t*)outputHash)[8*thread+k] = SWAB32(hash[k]); + } +} + +// Setup-Funktionen +__host__ void hefty_cpu_init(int thr_id, int threads) +{ + cudaSetDevice(thr_id); + + // Kopiere die Hash-Tabellen in den GPU-Speicher + cudaMemcpyToSymbol( hefty_gpu_constantTable, + hefty_cpu_constantTable, + sizeof(uint32_t) * 64 ); + + // Speicher für alle Hefty1 hashes belegen + cudaMalloc(&d_heftyHashes[thr_id], 8 * sizeof(uint32_t) * threads); +} + +__host__ void hefty_cpu_setBlock(int thr_id, int threads, void *data) + // data muss 84-Byte haben! +{ + // Nachricht expandieren und setzen + uint32_t msgBlock[32]; + + memset(msgBlock, 0, sizeof(uint32_t) * 32); + memcpy(&msgBlock[0], data, 84); + msgBlock[21] |= 0x80; + msgBlock[31] = 672; // bitlen + + for(int i=0;i<31;i++) // Byteorder drehen + msgBlock[i] = SWAB32(msgBlock[i]); + + // die erste Runde wird auf der CPU durchgeführt, da diese für + // alle Threads gleich ist. Der Hash wird dann an die Threads + // übergeben + + // Erstelle expandierten Block W + uint32_t W[64]; + memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16); + for(int j=16;j<64;j++) + W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16]; + + // Initialisiere die register a bis h mit der Hash-Tabelle + uint32_t regs[8]; + uint32_t hash[8]; + uint32_t sponge[4]; + + // pre + memset(sponge, 0, sizeof(uint32_t) * 4); + for (int k=0; k < 8; k++) + { + regs[k] = hefty_cpu_hashTable[k]; + hash[k] = regs[k]; + } + + // 1. 
Runde + for(int j=0;j<16;j++) + Absorb(sponge, W[j] ^ hefty_cpu_constantTable[j]); + + for(int j=0;j<16;j++) + { + Absorb(sponge, regs[3] ^ regs[7]); + hefty_cpu_round(regs, W[j], hefty_cpu_constantTable[j], sponge); + } + + for(int j=16;j<64;j++) + { + Absorb(sponge, regs[3] + regs[7]); + hefty_cpu_round(regs, W[j], hefty_cpu_constantTable[j], sponge); + } + + for(int k=0;k<8;k++) + hash[k] += regs[k]; + + // sponge speichern + + cudaMemcpyToSymbol( hefty_gpu_sponge, + sponge, + sizeof(uint32_t) * 4 ); + // hash speichern + cudaMemcpyToSymbol( hefty_gpu_register, + hash, + sizeof(uint32_t) * 8 ); + + // Blockheader setzen (korrekte Nonce fehlt da drin noch) + cudaMemcpyToSymbol( hefty_gpu_blockHeader, + &msgBlock[16], + 64); +} + +__host__ void hefty_cpu_hash(int thr_id, int threads, int startNounce) +{ + const int threadsperblock = 128; + + // berechne wie viele Thread Blocks wir brauchen + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + // Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl) +#if USE_SHARED + size_t shared_size = W_ALIGNMENT*sizeof(uint32_t)*threadsperblock; // ein uint32_t eingefügt gegen Bank Konflikte +#else + size_t shared_size = 0; +#endif + +// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); + + hefty_gpu_hash<<>>(threads, startNounce, (void*)d_heftyHashes[thr_id]); +} diff --git a/cuda_hefty1.h b/cuda_hefty1.h new file mode 100644 index 0000000..08b1844 --- /dev/null +++ b/cuda_hefty1.h @@ -0,0 +1,8 @@ +#ifndef _CUDA_HEFTY1_H +#define _CUDA_HEFTY1_H + +void hefty_cpu_hash(int thr_id, int threads, int startNounce); +void hefty_cpu_setBlock(int thr_id, int threads, void *data); +void hefty_cpu_init(int thr_id, int threads); + +#endif \ No newline at end of file diff --git a/cuda_keccak512.cu b/cuda_keccak512.cu new file mode 100644 index 0000000..66dddaf --- /dev/null +++ b/cuda_keccak512.cu @@ -0,0 +1,274 @@ +/* Diese Funktion ist auf 84+32-Byte große Eingabedaten ausgerichtet (Heavycoin) */ +#include +#include "cuda_runtime.h" +#include "device_launch_parameters.h" + +#include +#include + +// Folgende Definitionen später durch header ersetzen +typedef unsigned char uint8_t; +typedef unsigned int uint32_t; +typedef unsigned long long uint64_t; + +// globaler Speicher für alle HeftyHashes aller Threads +extern uint32_t *d_heftyHashes[8]; +extern uint32_t *d_nonceVector[8]; + +// globaler Speicher für unsere Ergebnisse +uint32_t *d_hash3output[8]; + +// der Keccak512 State nach der ersten Runde (72 Bytes) +__constant__ uint64_t c_State[25]; + +// die Message (72 Bytes) für die zweite Runde auf der GPU +__constant__ uint32_t c_PaddedMessage2[18]; // 44 bytes of remaining message (Nonce at offset 4) plus padding + +// ---------------------------- BEGIN CUDA keccak512 functions ------------------------------------ + +#define ROTL64(a,b) (((a) << (b)) | ((a) >> (64 - b))) + +#define U32TO64_LE(p) \ + (((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32)) + +#define U64TO32_LE(p, v) \ + *p = (uint32_t)((v)); *(p+1) = (uint32_t)((v) >> 32); + +static __device__ void mycpy72(uint32_t *d, const uint32_t *s) { +#pragma unroll 18 + for (int k=0; k < 18; ++k) d[k] = s[k]; +} + +static __device__ void mycpy32(uint32_t *d, const uint32_t *s) { +#pragma unroll 8 + for (int k=0; k < 8; ++k) d[k] = s[k]; +} + +typedef struct keccak_hash_state_t { + uint64_t state[25]; // 25*2 + uint32_t buffer[72/4]; // 72 +} keccak_hash_state; + +__device__ 
void statecopy(uint64_t *d, uint64_t *s) +{ +#pragma unroll 25 + for (int i=0; i < 25; ++i) + d[i] = s[i]; +} + + +static const uint64_t host_keccak_round_constants[24] = { + 0x0000000000000001ull, 0x0000000000008082ull, + 0x800000000000808aull, 0x8000000080008000ull, + 0x000000000000808bull, 0x0000000080000001ull, + 0x8000000080008081ull, 0x8000000000008009ull, + 0x000000000000008aull, 0x0000000000000088ull, + 0x0000000080008009ull, 0x000000008000000aull, + 0x000000008000808bull, 0x800000000000008bull, + 0x8000000000008089ull, 0x8000000000008003ull, + 0x8000000000008002ull, 0x8000000000000080ull, + 0x000000000000800aull, 0x800000008000000aull, + 0x8000000080008081ull, 0x8000000000008080ull, + 0x0000000080000001ull, 0x8000000080008008ull +}; + +__constant__ uint64_t c_keccak_round_constants[24]; + +__host__ __device__ void +keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_constants) { + size_t i; + uint64_t t[5], u[5], v, w; + + /* absorb input */ +#pragma unroll 9 + for (i = 0; i < 72 / 8; i++, in += 2) + s[i] ^= U32TO64_LE(in); + + for (i = 0; i < 24; i++) { + /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ + t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; + t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; + t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; + t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; + t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; + + /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ + u[0] = t[4] ^ ROTL64(t[1], 1); + u[1] = t[0] ^ ROTL64(t[2], 1); + u[2] = t[1] ^ ROTL64(t[3], 1); + u[3] = t[2] ^ ROTL64(t[4], 1); + u[4] = t[3] ^ ROTL64(t[0], 1); + + /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ + s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; + s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; + s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; + s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; + s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; + + /* rho pi: b[..] = rotl(a[..], ..) 
*/ + v = s[ 1]; + s[ 1] = ROTL64(s[ 6], 44); + s[ 6] = ROTL64(s[ 9], 20); + s[ 9] = ROTL64(s[22], 61); + s[22] = ROTL64(s[14], 39); + s[14] = ROTL64(s[20], 18); + s[20] = ROTL64(s[ 2], 62); + s[ 2] = ROTL64(s[12], 43); + s[12] = ROTL64(s[13], 25); + s[13] = ROTL64(s[19], 8); + s[19] = ROTL64(s[23], 56); + s[23] = ROTL64(s[15], 41); + s[15] = ROTL64(s[ 4], 27); + s[ 4] = ROTL64(s[24], 14); + s[24] = ROTL64(s[21], 2); + s[21] = ROTL64(s[ 8], 55); + s[ 8] = ROTL64(s[16], 45); + s[16] = ROTL64(s[ 5], 36); + s[ 5] = ROTL64(s[ 3], 28); + s[ 3] = ROTL64(s[18], 21); + s[18] = ROTL64(s[17], 15); + s[17] = ROTL64(s[11], 10); + s[11] = ROTL64(s[ 7], 6); + s[ 7] = ROTL64(s[10], 3); + s[10] = ROTL64( v, 1); + + /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ + v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w; + v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w; + v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; + v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; + v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; + + /* iota: a[0,0] ^= round constant */ + s[0] ^= keccak_round_constants[i]; + } +} + +// Die Hash-Funktion +__global__ void keccak512_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector) +{ + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + // bestimme den aktuellen Zähler + //uint32_t nounce = startNounce + thread; + uint32_t nounce = nonceVector[thread]; + + // Index-Position des Hashes in den Hash Puffern bestimmen (Hefty1 und outputHash) + uint32_t hashPosition = nounce - startNounce; + + // erstmal den State der ersten Runde holen + uint64_t keccak_gpu_state[25]; +#pragma unroll 25 + for (int i=0; i < 25; ++i) + keccak_gpu_state[i] = c_State[i]; + + // Message2 in den Puffer holen + uint32_t msgBlock[18]; + mycpy72(msgBlock, c_PaddedMessage2); + + // die individuelle Nonce einsetzen + msgBlock[1] = nounce; + + // den individuellen Hefty1 Hash einsetzen + mycpy32(&msgBlock[3], &heftyHashes[8 * hashPosition]); + + // den Block einmal gut durchschütteln + keccak_block(keccak_gpu_state, msgBlock, c_keccak_round_constants); + + // das Hash erzeugen + uint32_t hash[16]; + +#pragma unroll 8 + for (size_t i = 0; i < 64; i += 8) { + U64TO32_LE((&hash[i/4]), keccak_gpu_state[i / 8]); + } + + + // und ins Global Memory rausschreiben +#pragma unroll 16 + for(int k=0;k<16;k++) + ((uint32_t*)outputHash)[16*hashPosition+k] = hash[k]; + } +} + +// ---------------------------- END CUDA keccak512 functions ------------------------------------ + +// Setup-Funktionen +__host__ void keccak512_cpu_init(int thr_id, int threads) +{ + // Kopiere die Hash-Tabellen in den GPU-Speicher + cudaMemcpyToSymbol( c_keccak_round_constants, + host_keccak_round_constants, + sizeof(host_keccak_round_constants), + 0, cudaMemcpyHostToDevice); + + // Speicher für alle Ergebnisse belegen + cudaMalloc(&d_hash3output[thr_id], 16 * sizeof(uint32_t) * threads); +} + +// ----------------BEGIN keccak512 CPU version from scrypt-jane code -------------------- + +#define SCRYPT_HASH_DIGEST_SIZE 64 +#define 
SCRYPT_KECCAK_F 1600 +#define SCRYPT_KECCAK_C (SCRYPT_HASH_DIGEST_SIZE * 8 * 2) /* 1024 */ +#define SCRYPT_KECCAK_R (SCRYPT_KECCAK_F - SCRYPT_KECCAK_C) /* 576 */ +#define SCRYPT_HASH_BLOCK_SIZE (SCRYPT_KECCAK_R / 8) /* 72 */ + +// --------------- END keccak512 CPU version from scrypt-jane code -------------------- + +__host__ void keccak512_cpu_setBlock(void *data) + // data must be 84 bytes! + // heftyHash is 32 bytes +{ + // CH + // state init + uint64_t keccak_cpu_state[25]; + memset(keccak_cpu_state, 0, 200); + + // keccak uses 72-byte blocks, i.e. in our case two blocks + // of 72 bytes each + uint32_t msgBlock[18]; + memset(msgBlock, 0, 18 * sizeof(uint32_t)); + + // copy the data in (but only everything after byte 72) + memcpy(&msgBlock[0], &((uint8_t*)data)[72], 12); + + // finish the message (padding) + msgBlock[11] = 0x01; + msgBlock[17] = 0x80000000; + + // first round + keccak_block((uint64_t*)&keccak_cpu_state, (const uint32_t*)data, host_keccak_round_constants); + + // copy message 2 into constant memory (the per-thread nonce and + // the Hefty1 part still have to be patched in on the GPU) + cudaMemcpyToSymbol( c_PaddedMessage2, msgBlock, 18*sizeof(uint32_t), 0, cudaMemcpyHostToDevice ); + + // copy the state + cudaMemcpyToSymbol( c_State, keccak_cpu_state, 25*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); +} + +__host__ void keccak512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy) +{ + // copy the Hefty1 hashes + if (copy) cudaMemcpy( d_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice ); + //else cudaThreadSynchronize(); +} + +__host__ void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce) +{ + const int threadsperblock = 128; + + // compute how many thread blocks we need + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + // size of the dynamic shared memory region (depends on the thread count) + size_t shared_size = 0; + +// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); + + keccak512_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_hash3output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); +} diff --git a/cuda_keccak512.h b/cuda_keccak512.h new file mode 100644 index 0000000..abd4741 --- /dev/null +++ b/cuda_keccak512.h @@ -0,0 +1,9 @@ +#ifndef _CUDA_KECCAK512_H +#define _CUDA_KECCAK512_H + +void keccak512_cpu_init(int thr_id, int threads); +void keccak512_cpu_setBlock(void *data); +void keccak512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy); +void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce); + +#endif diff --git a/cuda_sha256.cu b/cuda_sha256.cu new file mode 100644 index 0000000..a68f849 --- /dev/null +++ b/cuda_sha256.cu @@ -0,0 +1,274 @@ +/* This function is laid out for 84+32-byte input data (Heavycoin) */ +#include +#include "cuda_runtime.h" +#include "device_launch_parameters.h" + +#include +#include + +#define W_ALIGNMENT 65 + +// replace the following definitions with a proper header later +typedef unsigned int uint32_t; + +// global memory for the Hefty1 hashes of all threads +extern uint32_t *d_heftyHashes[8]; +extern uint32_t *d_nonceVector[8]; + +// global memory for our results +uint32_t *d_hash2output[8]; + + +/* hash tables */ +__constant__ uint32_t sha256_gpu_constantTable[64]; + +// needs to be expanded +__constant__ uint32_t sha256_gpu_blockHeader[16]; // 2x512-bit message 
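+// Layout of the 116-byte message (84-byte block header + 32-byte Hefty1 hash): +// 116 bytes = 928 bits, padded out to two 512-bit SHA-256 blocks (32 uint32 words). +// The first block (words 0..15) is identical for every nonce, so it is compressed +// once on the CPU and only the resulting midstate (sha256_gpu_register below) plus +// this second block (words 16..31) are uploaded. Within the second block, word 3 +// carries the nonce (message word 19), words 5..12 carry the Hefty1 hash (message +// words 21..28), word 13 holds the 0x80 padding byte and word 15 the 928-bit length.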
+__constant__ uint32_t sha256_gpu_register[8]; + +uint32_t sha256_cpu_hashTable[] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 }; +uint32_t sha256_cpu_constantTable[] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, +}; + +#define S(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) +#define R(x, n) ((x) >> (n)) +#define Ch(x, y, z) ((x & (y ^ z)) ^ z) +#define Maj(x, y, z) ((x & (y | z)) | (y & z)) +#define S0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22)) +#define S1(x) (S(x, 6) ^ S(x, 11) ^ S(x, 25)) +#define s0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3)) +#define s1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10)) + +#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) ) + +// Die Hash-Funktion +__global__ void sha256_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector) +{ + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + // bestimme den aktuellen Zähler + uint32_t nounce = startNounce + thread; + nonceVector[thread] = nounce; + + // jeder thread in diesem Block bekommt sein eigenes W Array im Shared memory + //extern __shared__ unsigned char s[]; + //uint32_t *W = (uint32_t *)(&s[W_ALIGNMENT * sizeof(uint32_t) * threadIdx.x]); + uint32_t W1[16]; + uint32_t W2[16]; + + // Initialisiere die register a bis h mit der Hash-Tabelle + uint32_t regs[8]; + uint32_t hash[8]; + + // pre +#pragma unroll 8 + for (int k=0; k < 8; k++) + { + regs[k] = sha256_gpu_register[k]; + hash[k] = regs[k]; + } + + // 2. 
Runde + //memcpy(W, &sha256_gpu_blockHeader[0], sizeof(uint32_t) * 16); // TODO: aufsplitten in zwei Teilblöcke + //memcpy(&W[5], &heftyHashes[8 * (blockDim.x * blockIdx.x + threadIdx.x)], sizeof(uint32_t) * 8); // den richtigen Hefty1 Hash holen +#pragma unroll 16 + for(int k=0;k<16;k++) + W1[k] = sha256_gpu_blockHeader[k]; + + uint32_t offset = 8 * (blockDim.x * blockIdx.x + threadIdx.x); +#pragma unroll 8 + for(int k=0;k<8;k++) + W1[5+k] = heftyHashes[offset + k]; + + +#pragma unroll 8 + for (int i=5; i <5+8; ++i) W1[i] = SWAB32(W1[i]); // die Hefty1 Hashes brauchen eine Drehung ;) + W1[3] = SWAB32(nounce); + +// Progress W1 +#pragma unroll 16 + for(int j=0;j<16;j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_gpu_constantTable[j] + W1[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + #pragma unroll 7 + for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; + regs[0] = T1 + T2; + regs[4] += T1; + } + +// Progress W2...W3 +#pragma unroll 3 + for(int k=0;k<3;k++) + { + #pragma unroll 2 + for(int j=0;j<2;j++) + W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j]; + #pragma unroll 5 + for(int j=2;j<7;j++) + W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j]; + + #pragma unroll 8 + for(int j=7;j<15;j++) + W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j]; + + W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15]; + + // Rundenfunktion + #pragma unroll 16 + for(int j=0;j<16;j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_gpu_constantTable[j + 16 * (k+1)] + W2[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + #pragma unroll 7 + for (int l=6; l >= 0; l--) regs[l+1] = regs[l]; + regs[0] = T1 + T2; + regs[4] += T1; + } + + #pragma unroll 16 + for(int j=0;j<16;j++) + W1[j] = W2[j]; + } + +/* + for(int j=16;j<64;j++) + W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16]; + +#pragma unroll 64 + for(int j=0;j<64;j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_gpu_constantTable[j] + W[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + #pragma unroll 7 + for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; + regs[0] = T1 + T2; + regs[4] += T1; + } +*/ +#pragma unroll 8 + for(int k=0;k<8;k++) + hash[k] += regs[k]; + +#pragma unroll 8 + for(int k=0;k<8;k++) + ((uint32_t*)outputHash)[8*thread+k] = SWAB32(hash[k]); + } +} + +// Setup-Funktionen +__host__ void sha256_cpu_init(int thr_id, int threads) +{ + // Kopiere die Hash-Tabellen in den GPU-Speicher + cudaMemcpyToSymbol( sha256_gpu_constantTable, + sha256_cpu_constantTable, + sizeof(uint32_t) * 64 ); + + // Speicher für alle Ergebnisse belegen + cudaMalloc(&d_hash2output[thr_id], 8 * sizeof(uint32_t) * threads); +} + +__host__ void sha256_cpu_setBlock(void *data) + // data muss 84-Byte haben! + // heftyHash hat 32-Byte +{ + // Nachricht expandieren und setzen + uint32_t msgBlock[32]; + + memset(msgBlock, 0, sizeof(uint32_t) * 32); + memcpy(&msgBlock[0], data, 84); + memset(&msgBlock[21], 0, 32); // vorläufig Nullen anstatt der Hefty1 Hashes einfüllen + msgBlock[29] |= 0x80; + msgBlock[31] = 928; // bitlen + + for(int i=0;i<31;i++) // Byteorder drehen + msgBlock[i] = SWAB32(msgBlock[i]); + + // die erste Runde wird auf der CPU durchgeführt, da diese für + // alle Threads gleich ist. 
Der Hash wird dann an die Threads + // übergeben + uint32_t W[64]; + + // Erstelle expandierten Block W + memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16); + for(int j=16;j<64;j++) + W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16]; + + // Initialisiere die register a bis h mit der Hash-Tabelle + uint32_t regs[8]; + uint32_t hash[8]; + + // pre + for (int k=0; k < 8; k++) + { + regs[k] = sha256_cpu_hashTable[k]; + hash[k] = regs[k]; + } + + // 1. Runde + for(int j=0;j<64;j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_cpu_constantTable[j] + W[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + //#pragma unroll 7 + for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; + // sollte mal noch durch memmov ersetzt werden! +// memcpy(®s[1], ®s[0], sizeof(uint32_t) * 7); + regs[0] = T1 + T2; + regs[4] += T1; + } + + for(int k=0;k<8;k++) + hash[k] += regs[k]; + + // hash speichern + cudaMemcpyToSymbol( sha256_gpu_register, + hash, + sizeof(uint32_t) * 8 ); + + // Blockheader setzen (korrekte Nonce und Hefty Hash fehlen da drin noch) + cudaMemcpyToSymbol( sha256_gpu_blockHeader, + &msgBlock[16], + 64); +} + +__host__ void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy) +{ + // Hefty1 Hashes kopieren + if (copy) cudaMemcpy( d_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice ); + //else cudaThreadSynchronize(); +} + +__host__ void sha256_cpu_hash(int thr_id, int threads, int startNounce) +{ + const int threadsperblock = 128; + + // berechne wie viele Thread Blocks wir brauchen + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + // Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl) + //size_t shared_size = W_ALIGNMENT*sizeof(uint32_t)*threadsperblock; // ein uint32_t eingefügt gegen Bank Konflikte + size_t shared_size = 0; + +// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); + + sha256_gpu_hash<<>>(threads, startNounce, d_hash2output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); +} diff --git a/cuda_sha256.h b/cuda_sha256.h new file mode 100644 index 0000000..ff03bf5 --- /dev/null +++ b/cuda_sha256.h @@ -0,0 +1,8 @@ +#ifndef _CUDA_SHA256_H +#define _CUDA_SHA256_H + +void sha256_cpu_init(int thr_id, int threads); +void sha256_cpu_setBlock(void *data); +void sha256_cpu_hash(int thr_id, int threads, int startNounce); +void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy); +#endif diff --git a/depcomp b/depcomp new file mode 100644 index 0000000..bd0ac08 --- /dev/null +++ b/depcomp @@ -0,0 +1,688 @@ +#! /bin/sh +# depcomp - compile a program generating dependencies as side-effects + +scriptversion=2011-12-04.11; # UTC + +# Copyright (C) 1999, 2000, 2003, 2004, 2005, 2006, 2007, 2009, 2010, +# 2011 Free Software Foundation, Inc. + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. 
If not, see . + +# As a special exception to the GNU General Public License, if you +# distribute this file as part of a program that contains a +# configuration script generated by Autoconf, you may include it under +# the same distribution terms that you use for the rest of that program. + +# Originally written by Alexandre Oliva . + +case $1 in + '') + echo "$0: No command. Try \`$0 --help' for more information." 1>&2 + exit 1; + ;; + -h | --h*) + cat <<\EOF +Usage: depcomp [--help] [--version] PROGRAM [ARGS] + +Run PROGRAMS ARGS to compile a file, generating dependencies +as side-effects. + +Environment variables: + depmode Dependency tracking mode. + source Source file read by `PROGRAMS ARGS'. + object Object file output by `PROGRAMS ARGS'. + DEPDIR directory where to store dependencies. + depfile Dependency file to output. + tmpdepfile Temporary file to use when outputting dependencies. + libtool Whether libtool is used (yes/no). + +Report bugs to . +EOF + exit $? + ;; + -v | --v*) + echo "depcomp $scriptversion" + exit $? + ;; +esac + +if test -z "$depmode" || test -z "$source" || test -z "$object"; then + echo "depcomp: Variables source, object and depmode must be set" 1>&2 + exit 1 +fi + +# Dependencies for sub/bar.o or sub/bar.obj go into sub/.deps/bar.Po. +depfile=${depfile-`echo "$object" | + sed 's|[^\\/]*$|'${DEPDIR-.deps}'/&|;s|\.\([^.]*\)$|.P\1|;s|Pobj$|Po|'`} +tmpdepfile=${tmpdepfile-`echo "$depfile" | sed 's/\.\([^.]*\)$/.T\1/'`} + +rm -f "$tmpdepfile" + +# Some modes work just like other modes, but use different flags. We +# parameterize here, but still list the modes in the big case below, +# to make depend.m4 easier to write. Note that we *cannot* use a case +# here, because this file can only contain one case statement. +if test "$depmode" = hp; then + # HP compiler uses -M and no extra arg. + gccflag=-M + depmode=gcc +fi + +if test "$depmode" = dashXmstdout; then + # This is just like dashmstdout with a different argument. + dashmflag=-xM + depmode=dashmstdout +fi + +cygpath_u="cygpath -u -f -" +if test "$depmode" = msvcmsys; then + # This is just like msvisualcpp but w/o cygpath translation. + # Just convert the backslash-escaped backslashes to single forward + # slashes to satisfy depend.m4 + cygpath_u='sed s,\\\\,/,g' + depmode=msvisualcpp +fi + +if test "$depmode" = msvc7msys; then + # This is just like msvc7 but w/o cygpath translation. + # Just convert the backslash-escaped backslashes to single forward + # slashes to satisfy depend.m4 + cygpath_u='sed s,\\\\,/,g' + depmode=msvc7 +fi + +case "$depmode" in +gcc3) +## gcc 3 implements dependency tracking that does exactly what +## we want. Yay! Note: for some reason libtool 1.4 doesn't like +## it if -MD -MP comes after the -MF stuff. Hmm. +## Unfortunately, FreeBSD c89 acceptance of flags depends upon +## the command line argument order; so add the flags where they +## appear in depend2.am. Note that the slowdown incurred here +## affects only configure: in makefiles, %FASTDEP% shortcuts this. + for arg + do + case $arg in + -c) set fnord "$@" -MT "$object" -MD -MP -MF "$tmpdepfile" "$arg" ;; + *) set fnord "$@" "$arg" ;; + esac + shift # fnord + shift # $arg + done + "$@" + stat=$? + if test $stat -eq 0; then : + else + rm -f "$tmpdepfile" + exit $stat + fi + mv "$tmpdepfile" "$depfile" + ;; + +gcc) +## There are various ways to get dependency output from gcc. Here's +## why we pick this rather obscure method: +## - Don't want to use -MD because we'd like the dependencies to end +## up in a subdir. 
Having to rename by hand is ugly. +## (We might end up doing this anyway to support other compilers.) +## - The DEPENDENCIES_OUTPUT environment variable makes gcc act like +## -MM, not -M (despite what the docs say). +## - Using -M directly means running the compiler twice (even worse +## than renaming). + if test -z "$gccflag"; then + gccflag=-MD, + fi + "$@" -Wp,"$gccflag$tmpdepfile" + stat=$? + if test $stat -eq 0; then : + else + rm -f "$tmpdepfile" + exit $stat + fi + rm -f "$depfile" + echo "$object : \\" > "$depfile" + alpha=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz +## The second -e expression handles DOS-style file names with drive letters. + sed -e 's/^[^:]*: / /' \ + -e 's/^['$alpha']:\/[^:]*: / /' < "$tmpdepfile" >> "$depfile" +## This next piece of magic avoids the `deleted header file' problem. +## The problem is that when a header file which appears in a .P file +## is deleted, the dependency causes make to die (because there is +## typically no way to rebuild the header). We avoid this by adding +## dummy dependencies for each header file. Too bad gcc doesn't do +## this for us directly. + tr ' ' ' +' < "$tmpdepfile" | +## Some versions of gcc put a space before the `:'. On the theory +## that the space means something, we add a space to the output as +## well. hp depmode also adds that space, but also prefixes the VPATH +## to the object. Take care to not repeat it in the output. +## Some versions of the HPUX 10.20 sed can't process this invocation +## correctly. Breaking it into two sed invocations is a workaround. + sed -e 's/^\\$//' -e '/^$/d' -e "s|.*$object$||" -e '/:$/d' \ + | sed -e 's/$/ :/' >> "$depfile" + rm -f "$tmpdepfile" + ;; + +hp) + # This case exists only to let depend.m4 do its work. It works by + # looking at the text of this script. This case will never be run, + # since it is checked for above. + exit 1 + ;; + +sgi) + if test "$libtool" = yes; then + "$@" "-Wp,-MDupdate,$tmpdepfile" + else + "$@" -MDupdate "$tmpdepfile" + fi + stat=$? + if test $stat -eq 0; then : + else + rm -f "$tmpdepfile" + exit $stat + fi + rm -f "$depfile" + + if test -f "$tmpdepfile"; then # yes, the sourcefile depend on other files + echo "$object : \\" > "$depfile" + + # Clip off the initial element (the dependent). Don't try to be + # clever and replace this with sed code, as IRIX sed won't handle + # lines with more than a fixed number of characters (4096 in + # IRIX 6.2 sed, 8192 in IRIX 6.5). We also remove comment lines; + # the IRIX cc adds comments like `#:fec' to the end of the + # dependency line. + tr ' ' ' +' < "$tmpdepfile" \ + | sed -e 's/^.*\.o://' -e 's/#.*$//' -e '/^$/ d' | \ + tr ' +' ' ' >> "$depfile" + echo >> "$depfile" + + # The second pass generates a dummy entry for each header file. + tr ' ' ' +' < "$tmpdepfile" \ + | sed -e 's/^.*\.o://' -e 's/#.*$//' -e '/^$/ d' -e 's/$/:/' \ + >> "$depfile" + else + # The sourcefile does not contain any dependencies, so just + # store a dummy comment line, to avoid errors with the Makefile + # "include basename.Plo" scheme. + echo "#dummy" > "$depfile" + fi + rm -f "$tmpdepfile" + ;; + +aix) + # The C for AIX Compiler uses -M and outputs the dependencies + # in a .u file. In older versions, this file always lives in the + # current directory. Also, the AIX compiler puts `$object:' at the + # start of each line; $object doesn't have directory information. + # Version 6 uses the directory in both cases. 
+ dir=`echo "$object" | sed -e 's|/[^/]*$|/|'` + test "x$dir" = "x$object" && dir= + base=`echo "$object" | sed -e 's|^.*/||' -e 's/\.o$//' -e 's/\.lo$//'` + if test "$libtool" = yes; then + tmpdepfile1=$dir$base.u + tmpdepfile2=$base.u + tmpdepfile3=$dir.libs/$base.u + "$@" -Wc,-M + else + tmpdepfile1=$dir$base.u + tmpdepfile2=$dir$base.u + tmpdepfile3=$dir$base.u + "$@" -M + fi + stat=$? + + if test $stat -eq 0; then : + else + rm -f "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" + exit $stat + fi + + for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" + do + test -f "$tmpdepfile" && break + done + if test -f "$tmpdepfile"; then + # Each line is of the form `foo.o: dependent.h'. + # Do two passes, one to just change these to + # `$object: dependent.h' and one to simply `dependent.h:'. + sed -e "s,^.*\.[a-z]*:,$object:," < "$tmpdepfile" > "$depfile" + # That's a tab and a space in the []. + sed -e 's,^.*\.[a-z]*:[ ]*,,' -e 's,$,:,' < "$tmpdepfile" >> "$depfile" + else + # The sourcefile does not contain any dependencies, so just + # store a dummy comment line, to avoid errors with the Makefile + # "include basename.Plo" scheme. + echo "#dummy" > "$depfile" + fi + rm -f "$tmpdepfile" + ;; + +icc) + # Intel's C compiler understands `-MD -MF file'. However on + # icc -MD -MF foo.d -c -o sub/foo.o sub/foo.c + # ICC 7.0 will fill foo.d with something like + # foo.o: sub/foo.c + # foo.o: sub/foo.h + # which is wrong. We want: + # sub/foo.o: sub/foo.c + # sub/foo.o: sub/foo.h + # sub/foo.c: + # sub/foo.h: + # ICC 7.1 will output + # foo.o: sub/foo.c sub/foo.h + # and will wrap long lines using \ : + # foo.o: sub/foo.c ... \ + # sub/foo.h ... \ + # ... + + "$@" -MD -MF "$tmpdepfile" + stat=$? + if test $stat -eq 0; then : + else + rm -f "$tmpdepfile" + exit $stat + fi + rm -f "$depfile" + # Each line is of the form `foo.o: dependent.h', + # or `foo.o: dep1.h dep2.h \', or ` dep3.h dep4.h \'. + # Do two passes, one to just change these to + # `$object: dependent.h' and one to simply `dependent.h:'. + sed "s,^[^:]*:,$object :," < "$tmpdepfile" > "$depfile" + # Some versions of the HPUX 10.20 sed can't process this invocation + # correctly. Breaking it into two sed invocations is a workaround. + sed 's,^[^:]*: \(.*\)$,\1,;s/^\\$//;/^$/d;/:$/d' < "$tmpdepfile" | + sed -e 's/$/ :/' >> "$depfile" + rm -f "$tmpdepfile" + ;; + +hp2) + # The "hp" stanza above does not work with aCC (C++) and HP's ia64 + # compilers, which have integrated preprocessors. The correct option + # to use with these is +Maked; it writes dependencies to a file named + # 'foo.d', which lands next to the object file, wherever that + # happens to be. + # Much of this is similar to the tru64 case; see comments there. + dir=`echo "$object" | sed -e 's|/[^/]*$|/|'` + test "x$dir" = "x$object" && dir= + base=`echo "$object" | sed -e 's|^.*/||' -e 's/\.o$//' -e 's/\.lo$//'` + if test "$libtool" = yes; then + tmpdepfile1=$dir$base.d + tmpdepfile2=$dir.libs/$base.d + "$@" -Wc,+Maked + else + tmpdepfile1=$dir$base.d + tmpdepfile2=$dir$base.d + "$@" +Maked + fi + stat=$? + if test $stat -eq 0; then : + else + rm -f "$tmpdepfile1" "$tmpdepfile2" + exit $stat + fi + + for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2" + do + test -f "$tmpdepfile" && break + done + if test -f "$tmpdepfile"; then + sed -e "s,^.*\.[a-z]*:,$object:," "$tmpdepfile" > "$depfile" + # Add `dependent.h:' lines. 
+ sed -ne '2,${ + s/^ *// + s/ \\*$// + s/$/:/ + p + }' "$tmpdepfile" >> "$depfile" + else + echo "#dummy" > "$depfile" + fi + rm -f "$tmpdepfile" "$tmpdepfile2" + ;; + +tru64) + # The Tru64 compiler uses -MD to generate dependencies as a side + # effect. `cc -MD -o foo.o ...' puts the dependencies into `foo.o.d'. + # At least on Alpha/Redhat 6.1, Compaq CCC V6.2-504 seems to put + # dependencies in `foo.d' instead, so we check for that too. + # Subdirectories are respected. + dir=`echo "$object" | sed -e 's|/[^/]*$|/|'` + test "x$dir" = "x$object" && dir= + base=`echo "$object" | sed -e 's|^.*/||' -e 's/\.o$//' -e 's/\.lo$//'` + + if test "$libtool" = yes; then + # With Tru64 cc, shared objects can also be used to make a + # static library. This mechanism is used in libtool 1.4 series to + # handle both shared and static libraries in a single compilation. + # With libtool 1.4, dependencies were output in $dir.libs/$base.lo.d. + # + # With libtool 1.5 this exception was removed, and libtool now + # generates 2 separate objects for the 2 libraries. These two + # compilations output dependencies in $dir.libs/$base.o.d and + # in $dir$base.o.d. We have to check for both files, because + # one of the two compilations can be disabled. We should prefer + # $dir$base.o.d over $dir.libs/$base.o.d because the latter is + # automatically cleaned when .libs/ is deleted, while ignoring + # the former would cause a distcleancheck panic. + tmpdepfile1=$dir.libs/$base.lo.d # libtool 1.4 + tmpdepfile2=$dir$base.o.d # libtool 1.5 + tmpdepfile3=$dir.libs/$base.o.d # libtool 1.5 + tmpdepfile4=$dir.libs/$base.d # Compaq CCC V6.2-504 + "$@" -Wc,-MD + else + tmpdepfile1=$dir$base.o.d + tmpdepfile2=$dir$base.d + tmpdepfile3=$dir$base.d + tmpdepfile4=$dir$base.d + "$@" -MD + fi + + stat=$? + if test $stat -eq 0; then : + else + rm -f "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" "$tmpdepfile4" + exit $stat + fi + + for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" "$tmpdepfile4" + do + test -f "$tmpdepfile" && break + done + if test -f "$tmpdepfile"; then + sed -e "s,^.*\.[a-z]*:,$object:," < "$tmpdepfile" > "$depfile" + # That's a tab and a space in the []. + sed -e 's,^.*\.[a-z]*:[ ]*,,' -e 's,$,:,' < "$tmpdepfile" >> "$depfile" + else + echo "#dummy" > "$depfile" + fi + rm -f "$tmpdepfile" + ;; + +msvc7) + if test "$libtool" = yes; then + showIncludes=-Wc,-showIncludes + else + showIncludes=-showIncludes + fi + "$@" $showIncludes > "$tmpdepfile" + stat=$? + grep -v '^Note: including file: ' "$tmpdepfile" + if test "$stat" = 0; then : + else + rm -f "$tmpdepfile" + exit $stat + fi + rm -f "$depfile" + echo "$object : \\" > "$depfile" + # The first sed program below extracts the file names and escapes + # backslashes for cygpath. The second sed program outputs the file + # name when reading, but also accumulates all include files in the + # hold buffer in order to output them again at the end. This only + # works with sed implementations that can handle large buffers. + sed < "$tmpdepfile" -n ' +/^Note: including file: *\(.*\)/ { + s//\1/ + s/\\/\\\\/g + p +}' | $cygpath_u | sort -u | sed -n ' +s/ /\\ /g +s/\(.*\)/ \1 \\/p +s/.\(.*\) \\/\1:/ +H +$ { + s/.*/ / + G + p +}' >> "$depfile" + rm -f "$tmpdepfile" + ;; + +msvc7msys) + # This case exists only to let depend.m4 do its work. It works by + # looking at the text of this script. This case will never be run, + # since it is checked for above. 
+ exit 1 + ;; + +#nosideeffect) + # This comment above is used by automake to tell side-effect + # dependency tracking mechanisms from slower ones. + +dashmstdout) + # Important note: in order to support this mode, a compiler *must* + # always write the preprocessed file to stdout, regardless of -o. + "$@" || exit $? + + # Remove the call to Libtool. + if test "$libtool" = yes; then + while test "X$1" != 'X--mode=compile'; do + shift + done + shift + fi + + # Remove `-o $object'. + IFS=" " + for arg + do + case $arg in + -o) + shift + ;; + $object) + shift + ;; + *) + set fnord "$@" "$arg" + shift # fnord + shift # $arg + ;; + esac + done + + test -z "$dashmflag" && dashmflag=-M + # Require at least two characters before searching for `:' + # in the target name. This is to cope with DOS-style filenames: + # a dependency such as `c:/foo/bar' could be seen as target `c' otherwise. + "$@" $dashmflag | + sed 's:^[ ]*[^: ][^:][^:]*\:[ ]*:'"$object"'\: :' > "$tmpdepfile" + rm -f "$depfile" + cat < "$tmpdepfile" > "$depfile" + tr ' ' ' +' < "$tmpdepfile" | \ +## Some versions of the HPUX 10.20 sed can't process this invocation +## correctly. Breaking it into two sed invocations is a workaround. + sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile" + rm -f "$tmpdepfile" + ;; + +dashXmstdout) + # This case only exists to satisfy depend.m4. It is never actually + # run, as this mode is specially recognized in the preamble. + exit 1 + ;; + +makedepend) + "$@" || exit $? + # Remove any Libtool call + if test "$libtool" = yes; then + while test "X$1" != 'X--mode=compile'; do + shift + done + shift + fi + # X makedepend + shift + cleared=no eat=no + for arg + do + case $cleared in + no) + set ""; shift + cleared=yes ;; + esac + if test $eat = yes; then + eat=no + continue + fi + case "$arg" in + -D*|-I*) + set fnord "$@" "$arg"; shift ;; + # Strip any option that makedepend may not understand. Remove + # the object too, otherwise makedepend will parse it as a source file. + -arch) + eat=yes ;; + -*|$object) + ;; + *) + set fnord "$@" "$arg"; shift ;; + esac + done + obj_suffix=`echo "$object" | sed 's/^.*\././'` + touch "$tmpdepfile" + ${MAKEDEPEND-makedepend} -o"$obj_suffix" -f"$tmpdepfile" "$@" + rm -f "$depfile" + # makedepend may prepend the VPATH from the source file name to the object. + # No need to regex-escape $object, excess matching of '.' is harmless. + sed "s|^.*\($object *:\)|\1|" "$tmpdepfile" > "$depfile" + sed '1,2d' "$tmpdepfile" | tr ' ' ' +' | \ +## Some versions of the HPUX 10.20 sed can't process this invocation +## correctly. Breaking it into two sed invocations is a workaround. + sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile" + rm -f "$tmpdepfile" "$tmpdepfile".bak + ;; + +cpp) + # Important note: in order to support this mode, a compiler *must* + # always write the preprocessed file to stdout. + "$@" || exit $? + + # Remove the call to Libtool. + if test "$libtool" = yes; then + while test "X$1" != 'X--mode=compile'; do + shift + done + shift + fi + + # Remove `-o $object'. 
+ IFS=" " + for arg + do + case $arg in + -o) + shift + ;; + $object) + shift + ;; + *) + set fnord "$@" "$arg" + shift # fnord + shift # $arg + ;; + esac + done + + "$@" -E | + sed -n -e '/^# [0-9][0-9]* "\([^"]*\)".*/ s:: \1 \\:p' \ + -e '/^#line [0-9][0-9]* "\([^"]*\)".*/ s:: \1 \\:p' | + sed '$ s: \\$::' > "$tmpdepfile" + rm -f "$depfile" + echo "$object : \\" > "$depfile" + cat < "$tmpdepfile" >> "$depfile" + sed < "$tmpdepfile" '/^$/d;s/^ //;s/ \\$//;s/$/ :/' >> "$depfile" + rm -f "$tmpdepfile" + ;; + +msvisualcpp) + # Important note: in order to support this mode, a compiler *must* + # always write the preprocessed file to stdout. + "$@" || exit $? + + # Remove the call to Libtool. + if test "$libtool" = yes; then + while test "X$1" != 'X--mode=compile'; do + shift + done + shift + fi + + IFS=" " + for arg + do + case "$arg" in + -o) + shift + ;; + $object) + shift + ;; + "-Gm"|"/Gm"|"-Gi"|"/Gi"|"-ZI"|"/ZI") + set fnord "$@" + shift + shift + ;; + *) + set fnord "$@" "$arg" + shift + shift + ;; + esac + done + "$@" -E 2>/dev/null | + sed -n '/^#line [0-9][0-9]* "\([^"]*\)"/ s::\1:p' | $cygpath_u | sort -u > "$tmpdepfile" + rm -f "$depfile" + echo "$object : \\" > "$depfile" + sed < "$tmpdepfile" -n -e 's% %\\ %g' -e '/^\(.*\)$/ s:: \1 \\:p' >> "$depfile" + echo " " >> "$depfile" + sed < "$tmpdepfile" -n -e 's% %\\ %g' -e '/^\(.*\)$/ s::\1\::p' >> "$depfile" + rm -f "$tmpdepfile" + ;; + +msvcmsys) + # This case exists only to let depend.m4 do its work. It works by + # looking at the text of this script. This case will never be run, + # since it is checked for above. + exit 1 + ;; + +none) + exec "$@" + ;; + +*) + echo "Unknown depmode $depmode" 1>&2 + exit 1 + ;; +esac + +exit 0 + +# Local Variables: +# mode: shell-script +# sh-indentation: 2 +# eval: (add-hook 'write-file-hooks 'time-stamp) +# time-stamp-start: "scriptversion=" +# time-stamp-format: "%:y-%02m-%02d.%02H" +# time-stamp-time-zone: "UTC" +# time-stamp-end: "; # UTC" +# End: diff --git a/elist.h b/elist.h new file mode 100644 index 0000000..431472f --- /dev/null +++ b/elist.h @@ -0,0 +1,251 @@ +#ifndef _LINUX_LIST_H +#define _LINUX_LIST_H + +/* + * Simple doubly linked list implementation. + * + * Some of the internal functions ("__xxx") are useful when + * manipulating whole lists rather than single entries, as + * sometimes we already know the next/prev entries and we can + * generate better code by using them directly rather than + * using the generic single-entry routines. + */ + +struct list_head { + struct list_head *next, *prev; +}; + +#define LIST_HEAD_INIT(name) { &(name), &(name) } + +#define LIST_HEAD(name) \ + struct list_head name = LIST_HEAD_INIT(name) + +#define INIT_LIST_HEAD(ptr) do { \ + (ptr)->next = (ptr); (ptr)->prev = (ptr); \ +} while (0) + +/* + * Insert a new entry between two known consecutive entries. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static __inline void __list_add(struct list_head *lnew, + struct list_head *prev, + struct list_head *next) +{ + next->prev = lnew; + lnew->next = next; + lnew->prev = prev; + prev->next = lnew; +} + +/** + * list_add - add a new entry + * @new: new entry to be added + * @head: list head to add it after + * + * Insert a new entry after the specified head. + * This is good for implementing stacks. 
+ */ +static __inline void list_add(struct list_head *lnew, struct list_head *head) +{ + __list_add(lnew, head, head->next); +} + +/** + * list_add_tail - add a new entry + * @new: new entry to be added + * @head: list head to add it before + * + * Insert a new entry before the specified head. + * This is useful for implementing queues. + */ +static __inline void list_add_tail(struct list_head *lnew, struct list_head *head) +{ + __list_add(lnew, head->prev, head); +} + +/* + * Delete a list entry by making the prev/next entries + * point to each other. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static __inline void __list_del(struct list_head *prev, struct list_head *next) +{ + next->prev = prev; + prev->next = next; +} + +/** + * list_del - deletes entry from list. + * @entry: the element to delete from the list. + * Note: list_empty on entry does not return true after this, the entry is in an undefined state. + */ +static __inline void list_del(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + entry->next = (struct list_head *) 0; + entry->prev = (struct list_head *) 0; +} + +/** + * list_del_init - deletes entry from list and reinitialize it. + * @entry: the element to delete from the list. + */ +static __inline void list_del_init(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + INIT_LIST_HEAD(entry); +} + +/** + * list_move - delete from one list and add as another's head + * @list: the entry to move + * @head: the head that will precede our entry + */ +static __inline void list_move(struct list_head *list, struct list_head *head) +{ + __list_del(list->prev, list->next); + list_add(list, head); +} + +/** + * list_move_tail - delete from one list and add as another's tail + * @list: the entry to move + * @head: the head that will follow our entry + */ +static __inline void list_move_tail(struct list_head *list, + struct list_head *head) +{ + __list_del(list->prev, list->next); + list_add_tail(list, head); +} + +/** + * list_empty - tests whether a list is empty + * @head: the list to test. + */ +static __inline int list_empty(struct list_head *head) +{ + return head->next == head; +} + +static __inline void __list_splice(struct list_head *list, + struct list_head *head) +{ + struct list_head *first = list->next; + struct list_head *last = list->prev; + struct list_head *at = head->next; + + first->prev = head; + head->next = first; + + last->next = at; + at->prev = last; +} + +/** + * list_splice - join two lists + * @list: the new list to add. + * @head: the place to add it in the first list. + */ +static __inline void list_splice(struct list_head *list, struct list_head *head) +{ + if (!list_empty(list)) + __list_splice(list, head); +} + +/** + * list_splice_init - join two lists and reinitialise the emptied list. + * @list: the new list to add. + * @head: the place to add it in the first list. + * + * The list at @list is reinitialised + */ +static __inline void list_splice_init(struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) { + __list_splice(list, head); + INIT_LIST_HEAD(list); + } +} + +/** + * list_entry - get the struct for this entry + * @ptr: the &struct list_head pointer. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_struct within the struct. 
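+ *
+ * Usage sketch (illustrative only; 'work', 'head' and 'p' are hypothetical
+ * names, not part of this header):
+ *
+ *   struct work { int id; struct list_head node; };
+ *   struct list_head *p = head.next;
+ *   struct work *w = list_entry(p, struct work, node);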
+ */ +#define list_entry(ptr, type, member) \ + ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member))) + +/** + * list_for_each - iterate over a list + * @pos: the &struct list_head to use as a loop counter. + * @head: the head for your list. + */ +#define list_for_each(pos, head) \ + for (pos = (head)->next; pos != (head); \ + pos = pos->next) +/** + * list_for_each_prev - iterate over a list backwards + * @pos: the &struct list_head to use as a loop counter. + * @head: the head for your list. + */ +#define list_for_each_prev(pos, head) \ + for (pos = (head)->prev; pos != (head); \ + pos = pos->prev) + +/** + * list_for_each_safe - iterate over a list safe against removal of list entry + * @pos: the &struct list_head to use as a loop counter. + * @n: another &struct list_head to use as temporary storage + * @head: the head for your list. + */ +#define list_for_each_safe(pos, n, head) \ + for (pos = (head)->next, n = pos->next; pos != (head); \ + pos = n, n = pos->next) + +/** + * list_for_each_entry - iterate over list of given type + * @pos: the type * to use as a loop counter. + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + */ +#define list_for_each_entry(pos, head, member, tpos) \ + for (pos = list_entry((head)->next, tpos, member); \ + &pos->member != (head); \ + pos = list_entry(pos->member.next, tpos, member)) + +/** + * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry + * @pos: the type * to use as a loop counter. + * @n: another type * to use as temporary storage + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + */ +#define list_for_each_entry_safe(pos, n, head, member, tpos, tn) \ + for (pos = list_entry((head)->next, tpos, member), \ + n = list_entry(pos->member.next, tpos, member); \ + &pos->member != (head); \ + pos = n, n = list_entry(n->member.next, tn, member)) + +/** + * list_for_each_entry_continue - iterate over list of given type + * continuing after existing point + * @pos: the type * to use as a loop counter. + * @head: the head for your list. + * @member: the name of the list_struct within the struct. 
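+ *
+ * Note: unlike their Linux kernel counterparts, the entry iterators in this
+ * header take the entry type as an extra trailing argument. Illustrative
+ * sketch ('work', 'pending' and 'process' are hypothetical names):
+ *
+ *   struct work *w;
+ *   list_for_each_entry(w, &pending, node, struct work)
+ *       process(w);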
+ */ +#define list_for_each_entry_continue(pos, head, member, tpos) \ + for (pos = list_entry(pos->member.next, tpos, member), \ + prefetch(pos->member.next); \ + &pos->member != (head); \ + pos = list_entry(pos->member.next, tpos, member), \ + prefetch(pos->member.next)) + +#endif diff --git a/files.txt b/files.txt new file mode 100644 index 0000000..2b7db9d --- /dev/null +++ b/files.txt @@ -0,0 +1,30 @@ +blake512.cu +blake.c +combine.cu +compat.h +cpu-miner.c +cpuminer-config.h +cuda_blake512.h +cuda_combine.h +cuda_groestl512.h +cuda_hefty1.h +cuda_keccak512.h +cuda_sha256.h +elist.h +groestl512.cu +groestl.c +heavy.c +hefty1.c +hefty1.cu +hefty1.h +keccak512.cu +keccak.c +miner.h +scrypt.c +sha256.cu +sha2.c +sph_blake.h +sph_groestl.h +sph_keccak.h +sph_types.h +util.c diff --git a/fugue.c b/fugue.c new file mode 100644 index 0000000..85767c9 --- /dev/null +++ b/fugue.c @@ -0,0 +1,1208 @@ +#include +#include + +#include "sph_fugue.h" + +#ifdef __cplusplus +extern "C"{ +#endif + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +static const sph_u32 IV224[] = { + SPH_C32(0xf4c9120d), SPH_C32(0x6286f757), SPH_C32(0xee39e01c), + SPH_C32(0xe074e3cb), SPH_C32(0xa1127c62), SPH_C32(0x9a43d215), + SPH_C32(0xbd8d679a) +}; + +static const sph_u32 IV256[] = { + SPH_C32(0xe952bdde), SPH_C32(0x6671135f), SPH_C32(0xe0d4f668), + SPH_C32(0xd2b0b594), SPH_C32(0xf96c621d), SPH_C32(0xfbf929de), + SPH_C32(0x9149e899), SPH_C32(0x34f8c248) +}; + +static const sph_u32 IV384[] = { + SPH_C32(0xaa61ec0d), SPH_C32(0x31252e1f), SPH_C32(0xa01db4c7), + SPH_C32(0x00600985), SPH_C32(0x215ef44a), SPH_C32(0x741b5e9c), + SPH_C32(0xfa693e9a), SPH_C32(0x473eb040), SPH_C32(0xe502ae8a), + SPH_C32(0xa99c25e0), SPH_C32(0xbc95517c), SPH_C32(0x5c1095a1) +}; + +static const sph_u32 IV512[] = { + SPH_C32(0x8807a57e), SPH_C32(0xe616af75), SPH_C32(0xc5d3e4db), + SPH_C32(0xac9ab027), SPH_C32(0xd915f117), SPH_C32(0xb6eecc54), + SPH_C32(0x06e8020b), SPH_C32(0x4a92efd1), SPH_C32(0xaac6e2c9), + SPH_C32(0xddb21398), SPH_C32(0xcae65838), SPH_C32(0x437f203f), + SPH_C32(0x25ea78e7), SPH_C32(0x951fddd6), SPH_C32(0xda6ed11d), + SPH_C32(0xe13e3567) +}; + +static const sph_u32 mixtab0[] = { + SPH_C32(0x63633297), SPH_C32(0x7c7c6feb), SPH_C32(0x77775ec7), + SPH_C32(0x7b7b7af7), SPH_C32(0xf2f2e8e5), SPH_C32(0x6b6b0ab7), + SPH_C32(0x6f6f16a7), SPH_C32(0xc5c56d39), SPH_C32(0x303090c0), + SPH_C32(0x01010704), SPH_C32(0x67672e87), SPH_C32(0x2b2bd1ac), + SPH_C32(0xfefeccd5), SPH_C32(0xd7d71371), SPH_C32(0xabab7c9a), + SPH_C32(0x767659c3), SPH_C32(0xcaca4005), SPH_C32(0x8282a33e), + SPH_C32(0xc9c94909), SPH_C32(0x7d7d68ef), SPH_C32(0xfafad0c5), + SPH_C32(0x5959947f), SPH_C32(0x4747ce07), SPH_C32(0xf0f0e6ed), + SPH_C32(0xadad6e82), SPH_C32(0xd4d41a7d), SPH_C32(0xa2a243be), + SPH_C32(0xafaf608a), SPH_C32(0x9c9cf946), SPH_C32(0xa4a451a6), + SPH_C32(0x727245d3), SPH_C32(0xc0c0762d), SPH_C32(0xb7b728ea), + SPH_C32(0xfdfdc5d9), SPH_C32(0x9393d47a), SPH_C32(0x2626f298), + SPH_C32(0x363682d8), SPH_C32(0x3f3fbdfc), SPH_C32(0xf7f7f3f1), + SPH_C32(0xcccc521d), SPH_C32(0x34348cd0), SPH_C32(0xa5a556a2), + SPH_C32(0xe5e58db9), SPH_C32(0xf1f1e1e9), SPH_C32(0x71714cdf), + SPH_C32(0xd8d83e4d), SPH_C32(0x313197c4), SPH_C32(0x15156b54), + SPH_C32(0x04041c10), SPH_C32(0xc7c76331), SPH_C32(0x2323e98c), + SPH_C32(0xc3c37f21), SPH_C32(0x18184860), SPH_C32(0x9696cf6e), + SPH_C32(0x05051b14), SPH_C32(0x9a9aeb5e), SPH_C32(0x0707151c), + SPH_C32(0x12127e48), SPH_C32(0x8080ad36), SPH_C32(0xe2e298a5), + SPH_C32(0xebeba781), SPH_C32(0x2727f59c), SPH_C32(0xb2b233fe), + 
SPH_C32(0x757550cf), SPH_C32(0x09093f24), SPH_C32(0x8383a43a), + SPH_C32(0x2c2cc4b0), SPH_C32(0x1a1a4668), SPH_C32(0x1b1b416c), + SPH_C32(0x6e6e11a3), SPH_C32(0x5a5a9d73), SPH_C32(0xa0a04db6), + SPH_C32(0x5252a553), SPH_C32(0x3b3ba1ec), SPH_C32(0xd6d61475), + SPH_C32(0xb3b334fa), SPH_C32(0x2929dfa4), SPH_C32(0xe3e39fa1), + SPH_C32(0x2f2fcdbc), SPH_C32(0x8484b126), SPH_C32(0x5353a257), + SPH_C32(0xd1d10169), SPH_C32(0x00000000), SPH_C32(0xededb599), + SPH_C32(0x2020e080), SPH_C32(0xfcfcc2dd), SPH_C32(0xb1b13af2), + SPH_C32(0x5b5b9a77), SPH_C32(0x6a6a0db3), SPH_C32(0xcbcb4701), + SPH_C32(0xbebe17ce), SPH_C32(0x3939afe4), SPH_C32(0x4a4aed33), + SPH_C32(0x4c4cff2b), SPH_C32(0x5858937b), SPH_C32(0xcfcf5b11), + SPH_C32(0xd0d0066d), SPH_C32(0xefefbb91), SPH_C32(0xaaaa7b9e), + SPH_C32(0xfbfbd7c1), SPH_C32(0x4343d217), SPH_C32(0x4d4df82f), + SPH_C32(0x333399cc), SPH_C32(0x8585b622), SPH_C32(0x4545c00f), + SPH_C32(0xf9f9d9c9), SPH_C32(0x02020e08), SPH_C32(0x7f7f66e7), + SPH_C32(0x5050ab5b), SPH_C32(0x3c3cb4f0), SPH_C32(0x9f9ff04a), + SPH_C32(0xa8a87596), SPH_C32(0x5151ac5f), SPH_C32(0xa3a344ba), + SPH_C32(0x4040db1b), SPH_C32(0x8f8f800a), SPH_C32(0x9292d37e), + SPH_C32(0x9d9dfe42), SPH_C32(0x3838a8e0), SPH_C32(0xf5f5fdf9), + SPH_C32(0xbcbc19c6), SPH_C32(0xb6b62fee), SPH_C32(0xdada3045), + SPH_C32(0x2121e784), SPH_C32(0x10107040), SPH_C32(0xffffcbd1), + SPH_C32(0xf3f3efe1), SPH_C32(0xd2d20865), SPH_C32(0xcdcd5519), + SPH_C32(0x0c0c2430), SPH_C32(0x1313794c), SPH_C32(0xececb29d), + SPH_C32(0x5f5f8667), SPH_C32(0x9797c86a), SPH_C32(0x4444c70b), + SPH_C32(0x1717655c), SPH_C32(0xc4c46a3d), SPH_C32(0xa7a758aa), + SPH_C32(0x7e7e61e3), SPH_C32(0x3d3db3f4), SPH_C32(0x6464278b), + SPH_C32(0x5d5d886f), SPH_C32(0x19194f64), SPH_C32(0x737342d7), + SPH_C32(0x60603b9b), SPH_C32(0x8181aa32), SPH_C32(0x4f4ff627), + SPH_C32(0xdcdc225d), SPH_C32(0x2222ee88), SPH_C32(0x2a2ad6a8), + SPH_C32(0x9090dd76), SPH_C32(0x88889516), SPH_C32(0x4646c903), + SPH_C32(0xeeeebc95), SPH_C32(0xb8b805d6), SPH_C32(0x14146c50), + SPH_C32(0xdede2c55), SPH_C32(0x5e5e8163), SPH_C32(0x0b0b312c), + SPH_C32(0xdbdb3741), SPH_C32(0xe0e096ad), SPH_C32(0x32329ec8), + SPH_C32(0x3a3aa6e8), SPH_C32(0x0a0a3628), SPH_C32(0x4949e43f), + SPH_C32(0x06061218), SPH_C32(0x2424fc90), SPH_C32(0x5c5c8f6b), + SPH_C32(0xc2c27825), SPH_C32(0xd3d30f61), SPH_C32(0xacac6986), + SPH_C32(0x62623593), SPH_C32(0x9191da72), SPH_C32(0x9595c662), + SPH_C32(0xe4e48abd), SPH_C32(0x797974ff), SPH_C32(0xe7e783b1), + SPH_C32(0xc8c84e0d), SPH_C32(0x373785dc), SPH_C32(0x6d6d18af), + SPH_C32(0x8d8d8e02), SPH_C32(0xd5d51d79), SPH_C32(0x4e4ef123), + SPH_C32(0xa9a97292), SPH_C32(0x6c6c1fab), SPH_C32(0x5656b943), + SPH_C32(0xf4f4fafd), SPH_C32(0xeaeaa085), SPH_C32(0x6565208f), + SPH_C32(0x7a7a7df3), SPH_C32(0xaeae678e), SPH_C32(0x08083820), + SPH_C32(0xbaba0bde), SPH_C32(0x787873fb), SPH_C32(0x2525fb94), + SPH_C32(0x2e2ecab8), SPH_C32(0x1c1c5470), SPH_C32(0xa6a65fae), + SPH_C32(0xb4b421e6), SPH_C32(0xc6c66435), SPH_C32(0xe8e8ae8d), + SPH_C32(0xdddd2559), SPH_C32(0x747457cb), SPH_C32(0x1f1f5d7c), + SPH_C32(0x4b4bea37), SPH_C32(0xbdbd1ec2), SPH_C32(0x8b8b9c1a), + SPH_C32(0x8a8a9b1e), SPH_C32(0x70704bdb), SPH_C32(0x3e3ebaf8), + SPH_C32(0xb5b526e2), SPH_C32(0x66662983), SPH_C32(0x4848e33b), + SPH_C32(0x0303090c), SPH_C32(0xf6f6f4f5), SPH_C32(0x0e0e2a38), + SPH_C32(0x61613c9f), SPH_C32(0x35358bd4), SPH_C32(0x5757be47), + SPH_C32(0xb9b902d2), SPH_C32(0x8686bf2e), SPH_C32(0xc1c17129), + SPH_C32(0x1d1d5374), SPH_C32(0x9e9ef74e), SPH_C32(0xe1e191a9), + SPH_C32(0xf8f8decd), SPH_C32(0x9898e556), 
SPH_C32(0x11117744), + SPH_C32(0x696904bf), SPH_C32(0xd9d93949), SPH_C32(0x8e8e870e), + SPH_C32(0x9494c166), SPH_C32(0x9b9bec5a), SPH_C32(0x1e1e5a78), + SPH_C32(0x8787b82a), SPH_C32(0xe9e9a989), SPH_C32(0xcece5c15), + SPH_C32(0x5555b04f), SPH_C32(0x2828d8a0), SPH_C32(0xdfdf2b51), + SPH_C32(0x8c8c8906), SPH_C32(0xa1a14ab2), SPH_C32(0x89899212), + SPH_C32(0x0d0d2334), SPH_C32(0xbfbf10ca), SPH_C32(0xe6e684b5), + SPH_C32(0x4242d513), SPH_C32(0x686803bb), SPH_C32(0x4141dc1f), + SPH_C32(0x9999e252), SPH_C32(0x2d2dc3b4), SPH_C32(0x0f0f2d3c), + SPH_C32(0xb0b03df6), SPH_C32(0x5454b74b), SPH_C32(0xbbbb0cda), + SPH_C32(0x16166258) +}; + +static const sph_u32 mixtab1[] = { + SPH_C32(0x97636332), SPH_C32(0xeb7c7c6f), SPH_C32(0xc777775e), + SPH_C32(0xf77b7b7a), SPH_C32(0xe5f2f2e8), SPH_C32(0xb76b6b0a), + SPH_C32(0xa76f6f16), SPH_C32(0x39c5c56d), SPH_C32(0xc0303090), + SPH_C32(0x04010107), SPH_C32(0x8767672e), SPH_C32(0xac2b2bd1), + SPH_C32(0xd5fefecc), SPH_C32(0x71d7d713), SPH_C32(0x9aabab7c), + SPH_C32(0xc3767659), SPH_C32(0x05caca40), SPH_C32(0x3e8282a3), + SPH_C32(0x09c9c949), SPH_C32(0xef7d7d68), SPH_C32(0xc5fafad0), + SPH_C32(0x7f595994), SPH_C32(0x074747ce), SPH_C32(0xedf0f0e6), + SPH_C32(0x82adad6e), SPH_C32(0x7dd4d41a), SPH_C32(0xbea2a243), + SPH_C32(0x8aafaf60), SPH_C32(0x469c9cf9), SPH_C32(0xa6a4a451), + SPH_C32(0xd3727245), SPH_C32(0x2dc0c076), SPH_C32(0xeab7b728), + SPH_C32(0xd9fdfdc5), SPH_C32(0x7a9393d4), SPH_C32(0x982626f2), + SPH_C32(0xd8363682), SPH_C32(0xfc3f3fbd), SPH_C32(0xf1f7f7f3), + SPH_C32(0x1dcccc52), SPH_C32(0xd034348c), SPH_C32(0xa2a5a556), + SPH_C32(0xb9e5e58d), SPH_C32(0xe9f1f1e1), SPH_C32(0xdf71714c), + SPH_C32(0x4dd8d83e), SPH_C32(0xc4313197), SPH_C32(0x5415156b), + SPH_C32(0x1004041c), SPH_C32(0x31c7c763), SPH_C32(0x8c2323e9), + SPH_C32(0x21c3c37f), SPH_C32(0x60181848), SPH_C32(0x6e9696cf), + SPH_C32(0x1405051b), SPH_C32(0x5e9a9aeb), SPH_C32(0x1c070715), + SPH_C32(0x4812127e), SPH_C32(0x368080ad), SPH_C32(0xa5e2e298), + SPH_C32(0x81ebeba7), SPH_C32(0x9c2727f5), SPH_C32(0xfeb2b233), + SPH_C32(0xcf757550), SPH_C32(0x2409093f), SPH_C32(0x3a8383a4), + SPH_C32(0xb02c2cc4), SPH_C32(0x681a1a46), SPH_C32(0x6c1b1b41), + SPH_C32(0xa36e6e11), SPH_C32(0x735a5a9d), SPH_C32(0xb6a0a04d), + SPH_C32(0x535252a5), SPH_C32(0xec3b3ba1), SPH_C32(0x75d6d614), + SPH_C32(0xfab3b334), SPH_C32(0xa42929df), SPH_C32(0xa1e3e39f), + SPH_C32(0xbc2f2fcd), SPH_C32(0x268484b1), SPH_C32(0x575353a2), + SPH_C32(0x69d1d101), SPH_C32(0x00000000), SPH_C32(0x99ededb5), + SPH_C32(0x802020e0), SPH_C32(0xddfcfcc2), SPH_C32(0xf2b1b13a), + SPH_C32(0x775b5b9a), SPH_C32(0xb36a6a0d), SPH_C32(0x01cbcb47), + SPH_C32(0xcebebe17), SPH_C32(0xe43939af), SPH_C32(0x334a4aed), + SPH_C32(0x2b4c4cff), SPH_C32(0x7b585893), SPH_C32(0x11cfcf5b), + SPH_C32(0x6dd0d006), SPH_C32(0x91efefbb), SPH_C32(0x9eaaaa7b), + SPH_C32(0xc1fbfbd7), SPH_C32(0x174343d2), SPH_C32(0x2f4d4df8), + SPH_C32(0xcc333399), SPH_C32(0x228585b6), SPH_C32(0x0f4545c0), + SPH_C32(0xc9f9f9d9), SPH_C32(0x0802020e), SPH_C32(0xe77f7f66), + SPH_C32(0x5b5050ab), SPH_C32(0xf03c3cb4), SPH_C32(0x4a9f9ff0), + SPH_C32(0x96a8a875), SPH_C32(0x5f5151ac), SPH_C32(0xbaa3a344), + SPH_C32(0x1b4040db), SPH_C32(0x0a8f8f80), SPH_C32(0x7e9292d3), + SPH_C32(0x429d9dfe), SPH_C32(0xe03838a8), SPH_C32(0xf9f5f5fd), + SPH_C32(0xc6bcbc19), SPH_C32(0xeeb6b62f), SPH_C32(0x45dada30), + SPH_C32(0x842121e7), SPH_C32(0x40101070), SPH_C32(0xd1ffffcb), + SPH_C32(0xe1f3f3ef), SPH_C32(0x65d2d208), SPH_C32(0x19cdcd55), + SPH_C32(0x300c0c24), SPH_C32(0x4c131379), SPH_C32(0x9dececb2), + SPH_C32(0x675f5f86), 
SPH_C32(0x6a9797c8), SPH_C32(0x0b4444c7), + SPH_C32(0x5c171765), SPH_C32(0x3dc4c46a), SPH_C32(0xaaa7a758), + SPH_C32(0xe37e7e61), SPH_C32(0xf43d3db3), SPH_C32(0x8b646427), + SPH_C32(0x6f5d5d88), SPH_C32(0x6419194f), SPH_C32(0xd7737342), + SPH_C32(0x9b60603b), SPH_C32(0x328181aa), SPH_C32(0x274f4ff6), + SPH_C32(0x5ddcdc22), SPH_C32(0x882222ee), SPH_C32(0xa82a2ad6), + SPH_C32(0x769090dd), SPH_C32(0x16888895), SPH_C32(0x034646c9), + SPH_C32(0x95eeeebc), SPH_C32(0xd6b8b805), SPH_C32(0x5014146c), + SPH_C32(0x55dede2c), SPH_C32(0x635e5e81), SPH_C32(0x2c0b0b31), + SPH_C32(0x41dbdb37), SPH_C32(0xade0e096), SPH_C32(0xc832329e), + SPH_C32(0xe83a3aa6), SPH_C32(0x280a0a36), SPH_C32(0x3f4949e4), + SPH_C32(0x18060612), SPH_C32(0x902424fc), SPH_C32(0x6b5c5c8f), + SPH_C32(0x25c2c278), SPH_C32(0x61d3d30f), SPH_C32(0x86acac69), + SPH_C32(0x93626235), SPH_C32(0x729191da), SPH_C32(0x629595c6), + SPH_C32(0xbde4e48a), SPH_C32(0xff797974), SPH_C32(0xb1e7e783), + SPH_C32(0x0dc8c84e), SPH_C32(0xdc373785), SPH_C32(0xaf6d6d18), + SPH_C32(0x028d8d8e), SPH_C32(0x79d5d51d), SPH_C32(0x234e4ef1), + SPH_C32(0x92a9a972), SPH_C32(0xab6c6c1f), SPH_C32(0x435656b9), + SPH_C32(0xfdf4f4fa), SPH_C32(0x85eaeaa0), SPH_C32(0x8f656520), + SPH_C32(0xf37a7a7d), SPH_C32(0x8eaeae67), SPH_C32(0x20080838), + SPH_C32(0xdebaba0b), SPH_C32(0xfb787873), SPH_C32(0x942525fb), + SPH_C32(0xb82e2eca), SPH_C32(0x701c1c54), SPH_C32(0xaea6a65f), + SPH_C32(0xe6b4b421), SPH_C32(0x35c6c664), SPH_C32(0x8de8e8ae), + SPH_C32(0x59dddd25), SPH_C32(0xcb747457), SPH_C32(0x7c1f1f5d), + SPH_C32(0x374b4bea), SPH_C32(0xc2bdbd1e), SPH_C32(0x1a8b8b9c), + SPH_C32(0x1e8a8a9b), SPH_C32(0xdb70704b), SPH_C32(0xf83e3eba), + SPH_C32(0xe2b5b526), SPH_C32(0x83666629), SPH_C32(0x3b4848e3), + SPH_C32(0x0c030309), SPH_C32(0xf5f6f6f4), SPH_C32(0x380e0e2a), + SPH_C32(0x9f61613c), SPH_C32(0xd435358b), SPH_C32(0x475757be), + SPH_C32(0xd2b9b902), SPH_C32(0x2e8686bf), SPH_C32(0x29c1c171), + SPH_C32(0x741d1d53), SPH_C32(0x4e9e9ef7), SPH_C32(0xa9e1e191), + SPH_C32(0xcdf8f8de), SPH_C32(0x569898e5), SPH_C32(0x44111177), + SPH_C32(0xbf696904), SPH_C32(0x49d9d939), SPH_C32(0x0e8e8e87), + SPH_C32(0x669494c1), SPH_C32(0x5a9b9bec), SPH_C32(0x781e1e5a), + SPH_C32(0x2a8787b8), SPH_C32(0x89e9e9a9), SPH_C32(0x15cece5c), + SPH_C32(0x4f5555b0), SPH_C32(0xa02828d8), SPH_C32(0x51dfdf2b), + SPH_C32(0x068c8c89), SPH_C32(0xb2a1a14a), SPH_C32(0x12898992), + SPH_C32(0x340d0d23), SPH_C32(0xcabfbf10), SPH_C32(0xb5e6e684), + SPH_C32(0x134242d5), SPH_C32(0xbb686803), SPH_C32(0x1f4141dc), + SPH_C32(0x529999e2), SPH_C32(0xb42d2dc3), SPH_C32(0x3c0f0f2d), + SPH_C32(0xf6b0b03d), SPH_C32(0x4b5454b7), SPH_C32(0xdabbbb0c), + SPH_C32(0x58161662) +}; + +static const sph_u32 mixtab2[] = { + SPH_C32(0x32976363), SPH_C32(0x6feb7c7c), SPH_C32(0x5ec77777), + SPH_C32(0x7af77b7b), SPH_C32(0xe8e5f2f2), SPH_C32(0x0ab76b6b), + SPH_C32(0x16a76f6f), SPH_C32(0x6d39c5c5), SPH_C32(0x90c03030), + SPH_C32(0x07040101), SPH_C32(0x2e876767), SPH_C32(0xd1ac2b2b), + SPH_C32(0xccd5fefe), SPH_C32(0x1371d7d7), SPH_C32(0x7c9aabab), + SPH_C32(0x59c37676), SPH_C32(0x4005caca), SPH_C32(0xa33e8282), + SPH_C32(0x4909c9c9), SPH_C32(0x68ef7d7d), SPH_C32(0xd0c5fafa), + SPH_C32(0x947f5959), SPH_C32(0xce074747), SPH_C32(0xe6edf0f0), + SPH_C32(0x6e82adad), SPH_C32(0x1a7dd4d4), SPH_C32(0x43bea2a2), + SPH_C32(0x608aafaf), SPH_C32(0xf9469c9c), SPH_C32(0x51a6a4a4), + SPH_C32(0x45d37272), SPH_C32(0x762dc0c0), SPH_C32(0x28eab7b7), + SPH_C32(0xc5d9fdfd), SPH_C32(0xd47a9393), SPH_C32(0xf2982626), + SPH_C32(0x82d83636), SPH_C32(0xbdfc3f3f), SPH_C32(0xf3f1f7f7), + 
SPH_C32(0x521dcccc), SPH_C32(0x8cd03434), SPH_C32(0x56a2a5a5), + SPH_C32(0x8db9e5e5), SPH_C32(0xe1e9f1f1), SPH_C32(0x4cdf7171), + SPH_C32(0x3e4dd8d8), SPH_C32(0x97c43131), SPH_C32(0x6b541515), + SPH_C32(0x1c100404), SPH_C32(0x6331c7c7), SPH_C32(0xe98c2323), + SPH_C32(0x7f21c3c3), SPH_C32(0x48601818), SPH_C32(0xcf6e9696), + SPH_C32(0x1b140505), SPH_C32(0xeb5e9a9a), SPH_C32(0x151c0707), + SPH_C32(0x7e481212), SPH_C32(0xad368080), SPH_C32(0x98a5e2e2), + SPH_C32(0xa781ebeb), SPH_C32(0xf59c2727), SPH_C32(0x33feb2b2), + SPH_C32(0x50cf7575), SPH_C32(0x3f240909), SPH_C32(0xa43a8383), + SPH_C32(0xc4b02c2c), SPH_C32(0x46681a1a), SPH_C32(0x416c1b1b), + SPH_C32(0x11a36e6e), SPH_C32(0x9d735a5a), SPH_C32(0x4db6a0a0), + SPH_C32(0xa5535252), SPH_C32(0xa1ec3b3b), SPH_C32(0x1475d6d6), + SPH_C32(0x34fab3b3), SPH_C32(0xdfa42929), SPH_C32(0x9fa1e3e3), + SPH_C32(0xcdbc2f2f), SPH_C32(0xb1268484), SPH_C32(0xa2575353), + SPH_C32(0x0169d1d1), SPH_C32(0x00000000), SPH_C32(0xb599eded), + SPH_C32(0xe0802020), SPH_C32(0xc2ddfcfc), SPH_C32(0x3af2b1b1), + SPH_C32(0x9a775b5b), SPH_C32(0x0db36a6a), SPH_C32(0x4701cbcb), + SPH_C32(0x17cebebe), SPH_C32(0xafe43939), SPH_C32(0xed334a4a), + SPH_C32(0xff2b4c4c), SPH_C32(0x937b5858), SPH_C32(0x5b11cfcf), + SPH_C32(0x066dd0d0), SPH_C32(0xbb91efef), SPH_C32(0x7b9eaaaa), + SPH_C32(0xd7c1fbfb), SPH_C32(0xd2174343), SPH_C32(0xf82f4d4d), + SPH_C32(0x99cc3333), SPH_C32(0xb6228585), SPH_C32(0xc00f4545), + SPH_C32(0xd9c9f9f9), SPH_C32(0x0e080202), SPH_C32(0x66e77f7f), + SPH_C32(0xab5b5050), SPH_C32(0xb4f03c3c), SPH_C32(0xf04a9f9f), + SPH_C32(0x7596a8a8), SPH_C32(0xac5f5151), SPH_C32(0x44baa3a3), + SPH_C32(0xdb1b4040), SPH_C32(0x800a8f8f), SPH_C32(0xd37e9292), + SPH_C32(0xfe429d9d), SPH_C32(0xa8e03838), SPH_C32(0xfdf9f5f5), + SPH_C32(0x19c6bcbc), SPH_C32(0x2feeb6b6), SPH_C32(0x3045dada), + SPH_C32(0xe7842121), SPH_C32(0x70401010), SPH_C32(0xcbd1ffff), + SPH_C32(0xefe1f3f3), SPH_C32(0x0865d2d2), SPH_C32(0x5519cdcd), + SPH_C32(0x24300c0c), SPH_C32(0x794c1313), SPH_C32(0xb29decec), + SPH_C32(0x86675f5f), SPH_C32(0xc86a9797), SPH_C32(0xc70b4444), + SPH_C32(0x655c1717), SPH_C32(0x6a3dc4c4), SPH_C32(0x58aaa7a7), + SPH_C32(0x61e37e7e), SPH_C32(0xb3f43d3d), SPH_C32(0x278b6464), + SPH_C32(0x886f5d5d), SPH_C32(0x4f641919), SPH_C32(0x42d77373), + SPH_C32(0x3b9b6060), SPH_C32(0xaa328181), SPH_C32(0xf6274f4f), + SPH_C32(0x225ddcdc), SPH_C32(0xee882222), SPH_C32(0xd6a82a2a), + SPH_C32(0xdd769090), SPH_C32(0x95168888), SPH_C32(0xc9034646), + SPH_C32(0xbc95eeee), SPH_C32(0x05d6b8b8), SPH_C32(0x6c501414), + SPH_C32(0x2c55dede), SPH_C32(0x81635e5e), SPH_C32(0x312c0b0b), + SPH_C32(0x3741dbdb), SPH_C32(0x96ade0e0), SPH_C32(0x9ec83232), + SPH_C32(0xa6e83a3a), SPH_C32(0x36280a0a), SPH_C32(0xe43f4949), + SPH_C32(0x12180606), SPH_C32(0xfc902424), SPH_C32(0x8f6b5c5c), + SPH_C32(0x7825c2c2), SPH_C32(0x0f61d3d3), SPH_C32(0x6986acac), + SPH_C32(0x35936262), SPH_C32(0xda729191), SPH_C32(0xc6629595), + SPH_C32(0x8abde4e4), SPH_C32(0x74ff7979), SPH_C32(0x83b1e7e7), + SPH_C32(0x4e0dc8c8), SPH_C32(0x85dc3737), SPH_C32(0x18af6d6d), + SPH_C32(0x8e028d8d), SPH_C32(0x1d79d5d5), SPH_C32(0xf1234e4e), + SPH_C32(0x7292a9a9), SPH_C32(0x1fab6c6c), SPH_C32(0xb9435656), + SPH_C32(0xfafdf4f4), SPH_C32(0xa085eaea), SPH_C32(0x208f6565), + SPH_C32(0x7df37a7a), SPH_C32(0x678eaeae), SPH_C32(0x38200808), + SPH_C32(0x0bdebaba), SPH_C32(0x73fb7878), SPH_C32(0xfb942525), + SPH_C32(0xcab82e2e), SPH_C32(0x54701c1c), SPH_C32(0x5faea6a6), + SPH_C32(0x21e6b4b4), SPH_C32(0x6435c6c6), SPH_C32(0xae8de8e8), + SPH_C32(0x2559dddd), SPH_C32(0x57cb7474), 
SPH_C32(0x5d7c1f1f), + SPH_C32(0xea374b4b), SPH_C32(0x1ec2bdbd), SPH_C32(0x9c1a8b8b), + SPH_C32(0x9b1e8a8a), SPH_C32(0x4bdb7070), SPH_C32(0xbaf83e3e), + SPH_C32(0x26e2b5b5), SPH_C32(0x29836666), SPH_C32(0xe33b4848), + SPH_C32(0x090c0303), SPH_C32(0xf4f5f6f6), SPH_C32(0x2a380e0e), + SPH_C32(0x3c9f6161), SPH_C32(0x8bd43535), SPH_C32(0xbe475757), + SPH_C32(0x02d2b9b9), SPH_C32(0xbf2e8686), SPH_C32(0x7129c1c1), + SPH_C32(0x53741d1d), SPH_C32(0xf74e9e9e), SPH_C32(0x91a9e1e1), + SPH_C32(0xdecdf8f8), SPH_C32(0xe5569898), SPH_C32(0x77441111), + SPH_C32(0x04bf6969), SPH_C32(0x3949d9d9), SPH_C32(0x870e8e8e), + SPH_C32(0xc1669494), SPH_C32(0xec5a9b9b), SPH_C32(0x5a781e1e), + SPH_C32(0xb82a8787), SPH_C32(0xa989e9e9), SPH_C32(0x5c15cece), + SPH_C32(0xb04f5555), SPH_C32(0xd8a02828), SPH_C32(0x2b51dfdf), + SPH_C32(0x89068c8c), SPH_C32(0x4ab2a1a1), SPH_C32(0x92128989), + SPH_C32(0x23340d0d), SPH_C32(0x10cabfbf), SPH_C32(0x84b5e6e6), + SPH_C32(0xd5134242), SPH_C32(0x03bb6868), SPH_C32(0xdc1f4141), + SPH_C32(0xe2529999), SPH_C32(0xc3b42d2d), SPH_C32(0x2d3c0f0f), + SPH_C32(0x3df6b0b0), SPH_C32(0xb74b5454), SPH_C32(0x0cdabbbb), + SPH_C32(0x62581616) +}; + +static const sph_u32 mixtab3[] = { + SPH_C32(0x63329763), SPH_C32(0x7c6feb7c), SPH_C32(0x775ec777), + SPH_C32(0x7b7af77b), SPH_C32(0xf2e8e5f2), SPH_C32(0x6b0ab76b), + SPH_C32(0x6f16a76f), SPH_C32(0xc56d39c5), SPH_C32(0x3090c030), + SPH_C32(0x01070401), SPH_C32(0x672e8767), SPH_C32(0x2bd1ac2b), + SPH_C32(0xfeccd5fe), SPH_C32(0xd71371d7), SPH_C32(0xab7c9aab), + SPH_C32(0x7659c376), SPH_C32(0xca4005ca), SPH_C32(0x82a33e82), + SPH_C32(0xc94909c9), SPH_C32(0x7d68ef7d), SPH_C32(0xfad0c5fa), + SPH_C32(0x59947f59), SPH_C32(0x47ce0747), SPH_C32(0xf0e6edf0), + SPH_C32(0xad6e82ad), SPH_C32(0xd41a7dd4), SPH_C32(0xa243bea2), + SPH_C32(0xaf608aaf), SPH_C32(0x9cf9469c), SPH_C32(0xa451a6a4), + SPH_C32(0x7245d372), SPH_C32(0xc0762dc0), SPH_C32(0xb728eab7), + SPH_C32(0xfdc5d9fd), SPH_C32(0x93d47a93), SPH_C32(0x26f29826), + SPH_C32(0x3682d836), SPH_C32(0x3fbdfc3f), SPH_C32(0xf7f3f1f7), + SPH_C32(0xcc521dcc), SPH_C32(0x348cd034), SPH_C32(0xa556a2a5), + SPH_C32(0xe58db9e5), SPH_C32(0xf1e1e9f1), SPH_C32(0x714cdf71), + SPH_C32(0xd83e4dd8), SPH_C32(0x3197c431), SPH_C32(0x156b5415), + SPH_C32(0x041c1004), SPH_C32(0xc76331c7), SPH_C32(0x23e98c23), + SPH_C32(0xc37f21c3), SPH_C32(0x18486018), SPH_C32(0x96cf6e96), + SPH_C32(0x051b1405), SPH_C32(0x9aeb5e9a), SPH_C32(0x07151c07), + SPH_C32(0x127e4812), SPH_C32(0x80ad3680), SPH_C32(0xe298a5e2), + SPH_C32(0xeba781eb), SPH_C32(0x27f59c27), SPH_C32(0xb233feb2), + SPH_C32(0x7550cf75), SPH_C32(0x093f2409), SPH_C32(0x83a43a83), + SPH_C32(0x2cc4b02c), SPH_C32(0x1a46681a), SPH_C32(0x1b416c1b), + SPH_C32(0x6e11a36e), SPH_C32(0x5a9d735a), SPH_C32(0xa04db6a0), + SPH_C32(0x52a55352), SPH_C32(0x3ba1ec3b), SPH_C32(0xd61475d6), + SPH_C32(0xb334fab3), SPH_C32(0x29dfa429), SPH_C32(0xe39fa1e3), + SPH_C32(0x2fcdbc2f), SPH_C32(0x84b12684), SPH_C32(0x53a25753), + SPH_C32(0xd10169d1), SPH_C32(0x00000000), SPH_C32(0xedb599ed), + SPH_C32(0x20e08020), SPH_C32(0xfcc2ddfc), SPH_C32(0xb13af2b1), + SPH_C32(0x5b9a775b), SPH_C32(0x6a0db36a), SPH_C32(0xcb4701cb), + SPH_C32(0xbe17cebe), SPH_C32(0x39afe439), SPH_C32(0x4aed334a), + SPH_C32(0x4cff2b4c), SPH_C32(0x58937b58), SPH_C32(0xcf5b11cf), + SPH_C32(0xd0066dd0), SPH_C32(0xefbb91ef), SPH_C32(0xaa7b9eaa), + SPH_C32(0xfbd7c1fb), SPH_C32(0x43d21743), SPH_C32(0x4df82f4d), + SPH_C32(0x3399cc33), SPH_C32(0x85b62285), SPH_C32(0x45c00f45), + SPH_C32(0xf9d9c9f9), SPH_C32(0x020e0802), SPH_C32(0x7f66e77f), + SPH_C32(0x50ab5b50), 
SPH_C32(0x3cb4f03c), SPH_C32(0x9ff04a9f), + SPH_C32(0xa87596a8), SPH_C32(0x51ac5f51), SPH_C32(0xa344baa3), + SPH_C32(0x40db1b40), SPH_C32(0x8f800a8f), SPH_C32(0x92d37e92), + SPH_C32(0x9dfe429d), SPH_C32(0x38a8e038), SPH_C32(0xf5fdf9f5), + SPH_C32(0xbc19c6bc), SPH_C32(0xb62feeb6), SPH_C32(0xda3045da), + SPH_C32(0x21e78421), SPH_C32(0x10704010), SPH_C32(0xffcbd1ff), + SPH_C32(0xf3efe1f3), SPH_C32(0xd20865d2), SPH_C32(0xcd5519cd), + SPH_C32(0x0c24300c), SPH_C32(0x13794c13), SPH_C32(0xecb29dec), + SPH_C32(0x5f86675f), SPH_C32(0x97c86a97), SPH_C32(0x44c70b44), + SPH_C32(0x17655c17), SPH_C32(0xc46a3dc4), SPH_C32(0xa758aaa7), + SPH_C32(0x7e61e37e), SPH_C32(0x3db3f43d), SPH_C32(0x64278b64), + SPH_C32(0x5d886f5d), SPH_C32(0x194f6419), SPH_C32(0x7342d773), + SPH_C32(0x603b9b60), SPH_C32(0x81aa3281), SPH_C32(0x4ff6274f), + SPH_C32(0xdc225ddc), SPH_C32(0x22ee8822), SPH_C32(0x2ad6a82a), + SPH_C32(0x90dd7690), SPH_C32(0x88951688), SPH_C32(0x46c90346), + SPH_C32(0xeebc95ee), SPH_C32(0xb805d6b8), SPH_C32(0x146c5014), + SPH_C32(0xde2c55de), SPH_C32(0x5e81635e), SPH_C32(0x0b312c0b), + SPH_C32(0xdb3741db), SPH_C32(0xe096ade0), SPH_C32(0x329ec832), + SPH_C32(0x3aa6e83a), SPH_C32(0x0a36280a), SPH_C32(0x49e43f49), + SPH_C32(0x06121806), SPH_C32(0x24fc9024), SPH_C32(0x5c8f6b5c), + SPH_C32(0xc27825c2), SPH_C32(0xd30f61d3), SPH_C32(0xac6986ac), + SPH_C32(0x62359362), SPH_C32(0x91da7291), SPH_C32(0x95c66295), + SPH_C32(0xe48abde4), SPH_C32(0x7974ff79), SPH_C32(0xe783b1e7), + SPH_C32(0xc84e0dc8), SPH_C32(0x3785dc37), SPH_C32(0x6d18af6d), + SPH_C32(0x8d8e028d), SPH_C32(0xd51d79d5), SPH_C32(0x4ef1234e), + SPH_C32(0xa97292a9), SPH_C32(0x6c1fab6c), SPH_C32(0x56b94356), + SPH_C32(0xf4fafdf4), SPH_C32(0xeaa085ea), SPH_C32(0x65208f65), + SPH_C32(0x7a7df37a), SPH_C32(0xae678eae), SPH_C32(0x08382008), + SPH_C32(0xba0bdeba), SPH_C32(0x7873fb78), SPH_C32(0x25fb9425), + SPH_C32(0x2ecab82e), SPH_C32(0x1c54701c), SPH_C32(0xa65faea6), + SPH_C32(0xb421e6b4), SPH_C32(0xc66435c6), SPH_C32(0xe8ae8de8), + SPH_C32(0xdd2559dd), SPH_C32(0x7457cb74), SPH_C32(0x1f5d7c1f), + SPH_C32(0x4bea374b), SPH_C32(0xbd1ec2bd), SPH_C32(0x8b9c1a8b), + SPH_C32(0x8a9b1e8a), SPH_C32(0x704bdb70), SPH_C32(0x3ebaf83e), + SPH_C32(0xb526e2b5), SPH_C32(0x66298366), SPH_C32(0x48e33b48), + SPH_C32(0x03090c03), SPH_C32(0xf6f4f5f6), SPH_C32(0x0e2a380e), + SPH_C32(0x613c9f61), SPH_C32(0x358bd435), SPH_C32(0x57be4757), + SPH_C32(0xb902d2b9), SPH_C32(0x86bf2e86), SPH_C32(0xc17129c1), + SPH_C32(0x1d53741d), SPH_C32(0x9ef74e9e), SPH_C32(0xe191a9e1), + SPH_C32(0xf8decdf8), SPH_C32(0x98e55698), SPH_C32(0x11774411), + SPH_C32(0x6904bf69), SPH_C32(0xd93949d9), SPH_C32(0x8e870e8e), + SPH_C32(0x94c16694), SPH_C32(0x9bec5a9b), SPH_C32(0x1e5a781e), + SPH_C32(0x87b82a87), SPH_C32(0xe9a989e9), SPH_C32(0xce5c15ce), + SPH_C32(0x55b04f55), SPH_C32(0x28d8a028), SPH_C32(0xdf2b51df), + SPH_C32(0x8c89068c), SPH_C32(0xa14ab2a1), SPH_C32(0x89921289), + SPH_C32(0x0d23340d), SPH_C32(0xbf10cabf), SPH_C32(0xe684b5e6), + SPH_C32(0x42d51342), SPH_C32(0x6803bb68), SPH_C32(0x41dc1f41), + SPH_C32(0x99e25299), SPH_C32(0x2dc3b42d), SPH_C32(0x0f2d3c0f), + SPH_C32(0xb03df6b0), SPH_C32(0x54b74b54), SPH_C32(0xbb0cdabb), + SPH_C32(0x16625816) +}; + +#define TIX2(q, x00, x01, x08, x10, x24) do { \ + x10 ^= x00; \ + x00 = (q); \ + x08 ^= x00; \ + x01 ^= x24; \ + } while (0) + +#define TIX3(q, x00, x01, x04, x08, x16, x27, x30) do { \ + x16 ^= x00; \ + x00 = (q); \ + x08 ^= x00; \ + x01 ^= x27; \ + x04 ^= x30; \ + } while (0) + +#define TIX4(q, x00, x01, x04, x07, x08, x22, x24, x27, x30) do { \ + x22 ^= x00; 
\ + x00 = (q); \ + x08 ^= x00; \ + x01 ^= x24; \ + x04 ^= x27; \ + x07 ^= x30; \ + } while (0) + +#define CMIX30(x00, x01, x02, x04, x05, x06, x15, x16, x17) do { \ + x00 ^= x04; \ + x01 ^= x05; \ + x02 ^= x06; \ + x15 ^= x04; \ + x16 ^= x05; \ + x17 ^= x06; \ + } while (0) + +#define CMIX36(x00, x01, x02, x04, x05, x06, x18, x19, x20) do { \ + x00 ^= x04; \ + x01 ^= x05; \ + x02 ^= x06; \ + x18 ^= x04; \ + x19 ^= x05; \ + x20 ^= x06; \ + } while (0) + +#define SMIX(x0, x1, x2, x3) do { \ + sph_u32 c0 = 0; \ + sph_u32 c1 = 0; \ + sph_u32 c2 = 0; \ + sph_u32 c3 = 0; \ + sph_u32 r0 = 0; \ + sph_u32 r1 = 0; \ + sph_u32 r2 = 0; \ + sph_u32 r3 = 0; \ + sph_u32 tmp; \ + tmp = mixtab0[x0 >> 24]; \ + c0 ^= tmp; \ + tmp = mixtab1[(x0 >> 16) & 0xFF]; \ + c0 ^= tmp; \ + r1 ^= tmp; \ + tmp = mixtab2[(x0 >> 8) & 0xFF]; \ + c0 ^= tmp; \ + r2 ^= tmp; \ + tmp = mixtab3[x0 & 0xFF]; \ + c0 ^= tmp; \ + r3 ^= tmp; \ + tmp = mixtab0[x1 >> 24]; \ + c1 ^= tmp; \ + r0 ^= tmp; \ + tmp = mixtab1[(x1 >> 16) & 0xFF]; \ + c1 ^= tmp; \ + tmp = mixtab2[(x1 >> 8) & 0xFF]; \ + c1 ^= tmp; \ + r2 ^= tmp; \ + tmp = mixtab3[x1 & 0xFF]; \ + c1 ^= tmp; \ + r3 ^= tmp; \ + tmp = mixtab0[x2 >> 24]; \ + c2 ^= tmp; \ + r0 ^= tmp; \ + tmp = mixtab1[(x2 >> 16) & 0xFF]; \ + c2 ^= tmp; \ + r1 ^= tmp; \ + tmp = mixtab2[(x2 >> 8) & 0xFF]; \ + c2 ^= tmp; \ + tmp = mixtab3[x2 & 0xFF]; \ + c2 ^= tmp; \ + r3 ^= tmp; \ + tmp = mixtab0[x3 >> 24]; \ + c3 ^= tmp; \ + r0 ^= tmp; \ + tmp = mixtab1[(x3 >> 16) & 0xFF]; \ + c3 ^= tmp; \ + r1 ^= tmp; \ + tmp = mixtab2[(x3 >> 8) & 0xFF]; \ + c3 ^= tmp; \ + r2 ^= tmp; \ + tmp = mixtab3[x3 & 0xFF]; \ + c3 ^= tmp; \ + x0 = ((c0 ^ r0) & SPH_C32(0xFF000000)) \ + | ((c1 ^ r1) & SPH_C32(0x00FF0000)) \ + | ((c2 ^ r2) & SPH_C32(0x0000FF00)) \ + | ((c3 ^ r3) & SPH_C32(0x000000FF)); \ + x1 = ((c1 ^ (r0 << 8)) & SPH_C32(0xFF000000)) \ + | ((c2 ^ (r1 << 8)) & SPH_C32(0x00FF0000)) \ + | ((c3 ^ (r2 << 8)) & SPH_C32(0x0000FF00)) \ + | ((c0 ^ (r3 >> 24)) & SPH_C32(0x000000FF)); \ + x2 = ((c2 ^ (r0 << 16)) & SPH_C32(0xFF000000)) \ + | ((c3 ^ (r1 << 16)) & SPH_C32(0x00FF0000)) \ + | ((c0 ^ (r2 >> 16)) & SPH_C32(0x0000FF00)) \ + | ((c1 ^ (r3 >> 16)) & SPH_C32(0x000000FF)); \ + x3 = ((c3 ^ (r0 << 24)) & SPH_C32(0xFF000000)) \ + | ((c0 ^ (r1 >> 8)) & SPH_C32(0x00FF0000)) \ + | ((c1 ^ (r2 >> 8)) & SPH_C32(0x0000FF00)) \ + | ((c2 ^ (r3 >> 8)) & SPH_C32(0x000000FF)); \ + /* */ \ + } while (0) + +#if SPH_FUGUE_NOCOPY + +#define DECL_STATE_SMALL +#define READ_STATE_SMALL(state) +#define WRITE_STATE_SMALL(state) +#define DECL_STATE_BIG +#define READ_STATE_BIG(state) +#define WRITE_STATE_BIG(state) + +#define S00 ((sc)->S[ 0]) +#define S01 ((sc)->S[ 1]) +#define S02 ((sc)->S[ 2]) +#define S03 ((sc)->S[ 3]) +#define S04 ((sc)->S[ 4]) +#define S05 ((sc)->S[ 5]) +#define S06 ((sc)->S[ 6]) +#define S07 ((sc)->S[ 7]) +#define S08 ((sc)->S[ 8]) +#define S09 ((sc)->S[ 9]) +#define S10 ((sc)->S[10]) +#define S11 ((sc)->S[11]) +#define S12 ((sc)->S[12]) +#define S13 ((sc)->S[13]) +#define S14 ((sc)->S[14]) +#define S15 ((sc)->S[15]) +#define S16 ((sc)->S[16]) +#define S17 ((sc)->S[17]) +#define S18 ((sc)->S[18]) +#define S19 ((sc)->S[19]) +#define S20 ((sc)->S[20]) +#define S21 ((sc)->S[21]) +#define S22 ((sc)->S[22]) +#define S23 ((sc)->S[23]) +#define S24 ((sc)->S[24]) +#define S25 ((sc)->S[25]) +#define S26 ((sc)->S[26]) +#define S27 ((sc)->S[27]) +#define S28 ((sc)->S[28]) +#define S29 ((sc)->S[29]) +#define S30 ((sc)->S[30]) +#define S31 ((sc)->S[31]) +#define S32 ((sc)->S[32]) +#define S33 ((sc)->S[33]) +#define S34 ((sc)->S[34]) 
+#define S35 ((sc)->S[35]) + +#else + +#define DECL_STATE_SMALL \ + sph_u32 S00, S01, S02, S03, S04, S05, S06, S07, S08, S09; \ + sph_u32 S10, S11, S12, S13, S14, S15, S16, S17, S18, S19; \ + sph_u32 S20, S21, S22, S23, S24, S25, S26, S27, S28, S29; + +#define DECL_STATE_BIG \ + DECL_STATE_SMALL \ + sph_u32 S30, S31, S32, S33, S34, S35; + +#define READ_STATE_SMALL(state) do { \ + S00 = (state)->S[ 0]; \ + S01 = (state)->S[ 1]; \ + S02 = (state)->S[ 2]; \ + S03 = (state)->S[ 3]; \ + S04 = (state)->S[ 4]; \ + S05 = (state)->S[ 5]; \ + S06 = (state)->S[ 6]; \ + S07 = (state)->S[ 7]; \ + S08 = (state)->S[ 8]; \ + S09 = (state)->S[ 9]; \ + S10 = (state)->S[10]; \ + S11 = (state)->S[11]; \ + S12 = (state)->S[12]; \ + S13 = (state)->S[13]; \ + S14 = (state)->S[14]; \ + S15 = (state)->S[15]; \ + S16 = (state)->S[16]; \ + S17 = (state)->S[17]; \ + S18 = (state)->S[18]; \ + S19 = (state)->S[19]; \ + S20 = (state)->S[20]; \ + S21 = (state)->S[21]; \ + S22 = (state)->S[22]; \ + S23 = (state)->S[23]; \ + S24 = (state)->S[24]; \ + S25 = (state)->S[25]; \ + S26 = (state)->S[26]; \ + S27 = (state)->S[27]; \ + S28 = (state)->S[28]; \ + S29 = (state)->S[29]; \ + } while (0) + +#define READ_STATE_BIG(state) do { \ + READ_STATE_SMALL(state); \ + S30 = (state)->S[30]; \ + S31 = (state)->S[31]; \ + S32 = (state)->S[32]; \ + S33 = (state)->S[33]; \ + S34 = (state)->S[34]; \ + S35 = (state)->S[35]; \ + } while (0) + +#define WRITE_STATE_SMALL(state) do { \ + (state)->S[ 0] = S00; \ + (state)->S[ 1] = S01; \ + (state)->S[ 2] = S02; \ + (state)->S[ 3] = S03; \ + (state)->S[ 4] = S04; \ + (state)->S[ 5] = S05; \ + (state)->S[ 6] = S06; \ + (state)->S[ 7] = S07; \ + (state)->S[ 8] = S08; \ + (state)->S[ 9] = S09; \ + (state)->S[10] = S10; \ + (state)->S[11] = S11; \ + (state)->S[12] = S12; \ + (state)->S[13] = S13; \ + (state)->S[14] = S14; \ + (state)->S[15] = S15; \ + (state)->S[16] = S16; \ + (state)->S[17] = S17; \ + (state)->S[18] = S18; \ + (state)->S[19] = S19; \ + (state)->S[20] = S20; \ + (state)->S[21] = S21; \ + (state)->S[22] = S22; \ + (state)->S[23] = S23; \ + (state)->S[24] = S24; \ + (state)->S[25] = S25; \ + (state)->S[26] = S26; \ + (state)->S[27] = S27; \ + (state)->S[28] = S28; \ + (state)->S[29] = S29; \ + } while (0) + +#define WRITE_STATE_BIG(state) do { \ + WRITE_STATE_SMALL(state); \ + (state)->S[30] = S30; \ + (state)->S[31] = S31; \ + (state)->S[32] = S32; \ + (state)->S[33] = S33; \ + (state)->S[34] = S34; \ + (state)->S[35] = S35; \ + } while (0) + +#endif + +static void +fugue_init(sph_fugue_context *sc, size_t z_len, + const sph_u32 *iv, size_t iv_len) +{ + size_t u; + + for (u = 0; u < z_len; u ++) + sc->S[u] = 0; + memcpy(&sc->S[z_len], iv, iv_len * sizeof *iv); + sc->partial = 0; + sc->partial_len = 0; + sc->round_shift = 0; +#if SPH_64 + sc->bit_count = 0; +#else + sc->bit_count_high = 0; + sc->bit_count_low = 0; +#endif +} + +#if SPH_64 + +#define INCR_COUNTER do { \ + sc->bit_count += (sph_u64)len << 3; \ + } while (0) + +#else + +#define INCR_COUNTER do { \ + sph_u32 tmp = SPH_T32((sph_u32)len << 3); \ + sc->bit_count_low = SPH_T32(sc->bit_count_low + tmp); \ + if (sc->bit_count_low < tmp) \ + sc->bit_count_high ++; \ + sc->bit_count_high = SPH_T32(sc->bit_count_high \ + + ((sph_u32)len >> 29)); \ + } while (0) + +#endif + +#define CORE_ENTRY \ + sph_u32 p; \ + unsigned plen, rshift; \ + INCR_COUNTER; \ + p = sc->partial; \ + plen = sc->partial_len; \ + if (plen < 4) { \ + unsigned count = 4 - plen; \ + if (len < count) \ + count = len; \ + plen += count; \ + while (count -- > 0) 
{ \ + p = (p << 8) | *(const unsigned char *)data; \ + data = (const unsigned char *)data + 1; \ + len --; \ + } \ + if (len == 0) { \ + sc->partial = p; \ + sc->partial_len = plen; \ + return; \ + } \ + } + +#define CORE_EXIT \ + p = 0; \ + sc->partial_len = (unsigned)len; \ + while (len -- > 0) { \ + p = (p << 8) | *(const unsigned char *)data; \ + data = (const unsigned char *)data + 1; \ + } \ + sc->partial = p; \ + sc->round_shift = rshift; + +/* + * Not in a do..while: the 'break' must exit the outer loop. + */ +#define NEXT(rc) \ + if (len <= 4) { \ + rshift = (rc); \ + break; \ + } \ + p = sph_dec32be(data); \ + data = (const unsigned char *)data + 4; \ + len -= 4 + +static void +fugue2_core(sph_fugue_context *sc, const void *data, size_t len) +{ + DECL_STATE_SMALL + CORE_ENTRY + READ_STATE_SMALL(sc); + rshift = sc->round_shift; + switch (rshift) { + for (;;) { + sph_u32 q; + + case 0: + q = p; + TIX2(q, S00, S01, S08, S10, S24); + CMIX30(S27, S28, S29, S01, S02, S03, S12, S13, S14); + SMIX(S27, S28, S29, S00); + CMIX30(S24, S25, S26, S28, S29, S00, S09, S10, S11); + SMIX(S24, S25, S26, S27); + NEXT(1); + /* fall through */ + case 1: + q = p; + TIX2(q, S24, S25, S02, S04, S18); + CMIX30(S21, S22, S23, S25, S26, S27, S06, S07, S08); + SMIX(S21, S22, S23, S24); + CMIX30(S18, S19, S20, S22, S23, S24, S03, S04, S05); + SMIX(S18, S19, S20, S21); + NEXT(2); + /* fall through */ + case 2: + q = p; + TIX2(q, S18, S19, S26, S28, S12); + CMIX30(S15, S16, S17, S19, S20, S21, S00, S01, S02); + SMIX(S15, S16, S17, S18); + CMIX30(S12, S13, S14, S16, S17, S18, S27, S28, S29); + SMIX(S12, S13, S14, S15); + NEXT(3); + /* fall through */ + case 3: + q = p; + TIX2(q, S12, S13, S20, S22, S06); + CMIX30(S09, S10, S11, S13, S14, S15, S24, S25, S26); + SMIX(S09, S10, S11, S12); + CMIX30(S06, S07, S08, S10, S11, S12, S21, S22, S23); + SMIX(S06, S07, S08, S09); + NEXT(4); + /* fall through */ + case 4: + q = p; + TIX2(q, S06, S07, S14, S16, S00); + CMIX30(S03, S04, S05, S07, S08, S09, S18, S19, S20); + SMIX(S03, S04, S05, S06); + CMIX30(S00, S01, S02, S04, S05, S06, S15, S16, S17); + SMIX(S00, S01, S02, S03); + NEXT(0); + } + } + CORE_EXIT + WRITE_STATE_SMALL(sc); +} + +static void +fugue3_core(sph_fugue_context *sc, const void *data, size_t len) +{ + DECL_STATE_BIG + CORE_ENTRY + READ_STATE_BIG(sc); + rshift = sc->round_shift; + switch (rshift) { + for (;;) { + sph_u32 q; + + case 0: + q = p; + TIX3(q, S00, S01, S04, S08, S16, S27, S30); + CMIX36(S33, S34, S35, S01, S02, S03, S15, S16, S17); + SMIX(S33, S34, S35, S00); + CMIX36(S30, S31, S32, S34, S35, S00, S12, S13, S14); + SMIX(S30, S31, S32, S33); + CMIX36(S27, S28, S29, S31, S32, S33, S09, S10, S11); + SMIX(S27, S28, S29, S30); + NEXT(1); + /* fall through */ + case 1: + q = p; + TIX3(q, S27, S28, S31, S35, S07, S18, S21); + CMIX36(S24, S25, S26, S28, S29, S30, S06, S07, S08); + SMIX(S24, S25, S26, S27); + CMIX36(S21, S22, S23, S25, S26, S27, S03, S04, S05); + SMIX(S21, S22, S23, S24); + CMIX36(S18, S19, S20, S22, S23, S24, S00, S01, S02); + SMIX(S18, S19, S20, S21); + NEXT(2); + /* fall through */ + case 2: + q = p; + TIX3(q, S18, S19, S22, S26, S34, S09, S12); + CMIX36(S15, S16, S17, S19, S20, S21, S33, S34, S35); + SMIX(S15, S16, S17, S18); + CMIX36(S12, S13, S14, S16, S17, S18, S30, S31, S32); + SMIX(S12, S13, S14, S15); + CMIX36(S09, S10, S11, S13, S14, S15, S27, S28, S29); + SMIX(S09, S10, S11, S12); + NEXT(3); + /* fall through */ + case 3: + q = p; + TIX3(q, S09, S10, S13, S17, S25, S00, S03); + CMIX36(S06, S07, S08, S10, S11, S12, S24, S25, 
S26); + SMIX(S06, S07, S08, S09); + CMIX36(S03, S04, S05, S07, S08, S09, S21, S22, S23); + SMIX(S03, S04, S05, S06); + CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); + SMIX(S00, S01, S02, S03); + NEXT(0); + } + } + CORE_EXIT + WRITE_STATE_BIG(sc); +} + +static void +fugue4_core(sph_fugue_context *sc, const void *data, size_t len) +{ + DECL_STATE_BIG + CORE_ENTRY + READ_STATE_BIG(sc); + rshift = sc->round_shift; + switch (rshift) { + for (;;) { + sph_u32 q; + + case 0: + q = p; + TIX4(q, S00, S01, S04, S07, S08, S22, S24, S27, S30); + CMIX36(S33, S34, S35, S01, S02, S03, S15, S16, S17); + SMIX(S33, S34, S35, S00); + CMIX36(S30, S31, S32, S34, S35, S00, S12, S13, S14); + SMIX(S30, S31, S32, S33); + CMIX36(S27, S28, S29, S31, S32, S33, S09, S10, S11); + SMIX(S27, S28, S29, S30); + CMIX36(S24, S25, S26, S28, S29, S30, S06, S07, S08); + SMIX(S24, S25, S26, S27); + NEXT(1); + /* fall through */ + case 1: + q = p; + TIX4(q, S24, S25, S28, S31, S32, S10, S12, S15, S18); + CMIX36(S21, S22, S23, S25, S26, S27, S03, S04, S05); + SMIX(S21, S22, S23, S24); + CMIX36(S18, S19, S20, S22, S23, S24, S00, S01, S02); + SMIX(S18, S19, S20, S21); + CMIX36(S15, S16, S17, S19, S20, S21, S33, S34, S35); + SMIX(S15, S16, S17, S18); + CMIX36(S12, S13, S14, S16, S17, S18, S30, S31, S32); + SMIX(S12, S13, S14, S15); + NEXT(2); + /* fall through */ + case 2: + q = p; + TIX4(q, S12, S13, S16, S19, S20, S34, S00, S03, S06); + CMIX36(S09, S10, S11, S13, S14, S15, S27, S28, S29); + SMIX(S09, S10, S11, S12); + CMIX36(S06, S07, S08, S10, S11, S12, S24, S25, S26); + SMIX(S06, S07, S08, S09); + CMIX36(S03, S04, S05, S07, S08, S09, S21, S22, S23); + SMIX(S03, S04, S05, S06); + CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); + SMIX(S00, S01, S02, S03); + NEXT(0); + } + } + CORE_EXIT + WRITE_STATE_BIG(sc); +} + +#if SPH_64 + +#define WRITE_COUNTER do { \ + sph_enc64be(buf + 4, sc->bit_count + n); \ + } while (0) + +#else + +#define WRITE_COUNTER do { \ + sph_enc32be(buf + 4, sc->bit_count_high); \ + sph_enc32be(buf + 8, sc->bit_count_low + n); \ + } while (0) + +#endif + +#define CLOSE_ENTRY(s, rcm, core) \ + unsigned char buf[16]; \ + unsigned plen, rms; \ + unsigned char *out; \ + sph_u32 S[s]; \ + plen = sc->partial_len; \ + WRITE_COUNTER; \ + if (plen == 0 && n == 0) { \ + plen = 4; \ + } else if (plen < 4 || n != 0) { \ + unsigned u; \ + \ + if (plen == 4) \ + plen = 0; \ + buf[plen] = ub & ~(0xFFU >> n); \ + for (u = plen + 1; u < 4; u ++) \ + buf[u] = 0; \ + } \ + core(sc, buf + plen, (sizeof buf) - plen); \ + rms = sc->round_shift * (rcm); \ + memcpy(S, sc->S + (s) - rms, rms * sizeof(sph_u32)); \ + memcpy(S + rms, sc->S, ((s) - rms) * sizeof(sph_u32)); + +#define ROR(n, s) do { \ + sph_u32 tmp[n]; \ + memcpy(tmp, S + ((s) - (n)), (n) * sizeof(sph_u32)); \ + memmove(S + (n), S, ((s) - (n)) * sizeof(sph_u32)); \ + memcpy(S, tmp, (n) * sizeof(sph_u32)); \ + } while (0) + +static void +fugue2_close(sph_fugue_context *sc, unsigned ub, unsigned n, + void *dst, size_t out_size_w32) +{ + int i; + + CLOSE_ENTRY(30, 6, fugue2_core) + for (i = 0; i < 10; i ++) { + ROR(3, 30); + CMIX30(S[0], S[1], S[2], S[4], S[5], S[6], S[15], S[16], S[17]); + SMIX(S[0], S[1], S[2], S[3]); + } + for (i = 0; i < 13; i ++) { + S[4] ^= S[0]; + S[15] ^= S[0]; + ROR(15, 30); + SMIX(S[0], S[1], S[2], S[3]); + S[4] ^= S[0]; + S[16] ^= S[0]; + ROR(14, 30); + SMIX(S[0], S[1], S[2], S[3]); + } + S[4] ^= S[0]; + S[15] ^= S[0]; + out = dst; + sph_enc32be(out + 0, S[ 1]); + sph_enc32be(out + 4, S[ 2]); + sph_enc32be(out + 8, S[ 3]); + sph_enc32be(out 
+ 12, S[ 4]); + sph_enc32be(out + 16, S[15]); + sph_enc32be(out + 20, S[16]); + sph_enc32be(out + 24, S[17]); + if (out_size_w32 == 8) { + sph_enc32be(out + 28, S[18]); + sph_fugue256_init(sc); + } else { + sph_fugue224_init(sc); + } +} + +static void +fugue3_close(sph_fugue_context *sc, unsigned ub, unsigned n, void *dst) +{ + int i; + + CLOSE_ENTRY(36, 9, fugue3_core) + for (i = 0; i < 18; i ++) { + ROR(3, 36); + CMIX36(S[0], S[1], S[2], S[4], S[5], S[6], S[18], S[19], S[20]); + SMIX(S[0], S[1], S[2], S[3]); + } + for (i = 0; i < 13; i ++) { + S[4] ^= S[0]; + S[12] ^= S[0]; + S[24] ^= S[0]; + ROR(12, 36); + SMIX(S[0], S[1], S[2], S[3]); + S[4] ^= S[0]; + S[13] ^= S[0]; + S[24] ^= S[0]; + ROR(12, 36); + SMIX(S[0], S[1], S[2], S[3]); + S[4] ^= S[0]; + S[13] ^= S[0]; + S[25] ^= S[0]; + ROR(11, 36); + SMIX(S[0], S[1], S[2], S[3]); + } + S[4] ^= S[0]; + S[12] ^= S[0]; + S[24] ^= S[0]; + out = dst; + sph_enc32be(out + 0, S[ 1]); + sph_enc32be(out + 4, S[ 2]); + sph_enc32be(out + 8, S[ 3]); + sph_enc32be(out + 12, S[ 4]); + sph_enc32be(out + 16, S[12]); + sph_enc32be(out + 20, S[13]); + sph_enc32be(out + 24, S[14]); + sph_enc32be(out + 28, S[15]); + sph_enc32be(out + 32, S[24]); + sph_enc32be(out + 36, S[25]); + sph_enc32be(out + 40, S[26]); + sph_enc32be(out + 44, S[27]); + sph_fugue384_init(sc); +} + +static void +fugue4_close(sph_fugue_context *sc, unsigned ub, unsigned n, void *dst) +{ + int i; + + CLOSE_ENTRY(36, 12, fugue4_core) + for (i = 0; i < 32; i ++) { + ROR(3, 36); + CMIX36(S[0], S[1], S[2], S[4], S[5], S[6], S[18], S[19], S[20]); + SMIX(S[0], S[1], S[2], S[3]); + } + for (i = 0; i < 13; i ++) { + S[4] ^= S[0]; + S[9] ^= S[0]; + S[18] ^= S[0]; + S[27] ^= S[0]; + ROR(9, 36); + SMIX(S[0], S[1], S[2], S[3]); + S[4] ^= S[0]; + S[10] ^= S[0]; + S[18] ^= S[0]; + S[27] ^= S[0]; + ROR(9, 36); + SMIX(S[0], S[1], S[2], S[3]); + S[4] ^= S[0]; + S[10] ^= S[0]; + S[19] ^= S[0]; + S[27] ^= S[0]; + ROR(9, 36); + SMIX(S[0], S[1], S[2], S[3]); + S[4] ^= S[0]; + S[10] ^= S[0]; + S[19] ^= S[0]; + S[28] ^= S[0]; + ROR(8, 36); + SMIX(S[0], S[1], S[2], S[3]); + } + S[4] ^= S[0]; + S[9] ^= S[0]; + S[18] ^= S[0]; + S[27] ^= S[0]; + out = dst; + sph_enc32be(out + 0, S[ 1]); + sph_enc32be(out + 4, S[ 2]); + sph_enc32be(out + 8, S[ 3]); + sph_enc32be(out + 12, S[ 4]); + sph_enc32be(out + 16, S[ 9]); + sph_enc32be(out + 20, S[10]); + sph_enc32be(out + 24, S[11]); + sph_enc32be(out + 28, S[12]); + sph_enc32be(out + 32, S[18]); + sph_enc32be(out + 36, S[19]); + sph_enc32be(out + 40, S[20]); + sph_enc32be(out + 44, S[21]); + sph_enc32be(out + 48, S[27]); + sph_enc32be(out + 52, S[28]); + sph_enc32be(out + 56, S[29]); + sph_enc32be(out + 60, S[30]); + sph_fugue512_init(sc); +} + +void +sph_fugue224_init(void *cc) +{ + fugue_init(cc, 23, IV224, 7); +} + +void +sph_fugue224(void *cc, const void *data, size_t len) +{ + fugue2_core(cc, data, len); +} + +void +sph_fugue224_close(void *cc, void *dst) +{ + fugue2_close(cc, 0, 0, dst, 7); +} + +void +sph_fugue224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + fugue2_close(cc, ub, n, dst, 7); +} + +void +sph_fugue256_init(void *cc) +{ + fugue_init(cc, 22, IV256, 8); +} + +void +sph_fugue256(void *cc, const void *data, size_t len) +{ + fugue2_core(cc, data, len); +} + +void +sph_fugue256_close(void *cc, void *dst) +{ + fugue2_close(cc, 0, 0, dst, 8); +} + +void +sph_fugue256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + fugue2_close(cc, ub, n, dst, 8); +} + +void +sph_fugue384_init(void *cc) +{ + fugue_init(cc, 24, IV384, 12); 
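+	/* 24 zero words followed by the 12 IV words fill the 36-word
+	   internal state used by the big (Fugue-384/512) variants */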
+}
+
+void
+sph_fugue384(void *cc, const void *data, size_t len)
+{
+	fugue3_core(cc, data, len);
+}
+
+void
+sph_fugue384_close(void *cc, void *dst)
+{
+	fugue3_close(cc, 0, 0, dst);
+}
+
+void
+sph_fugue384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	fugue3_close(cc, ub, n, dst);
+}
+
+void
+sph_fugue512_init(void *cc)
+{
+	fugue_init(cc, 20, IV512, 16);
+}
+
+void
+sph_fugue512(void *cc, const void *data, size_t len)
+{
+	fugue4_core(cc, data, len);
+}
+
+void
+sph_fugue512_close(void *cc, void *dst)
+{
+	fugue4_close(cc, 0, 0, dst);
+}
+
+void
+sph_fugue512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	fugue4_close(cc, ub, n, dst);
+}
+#ifdef __cplusplus
+}
+#endif
diff --git a/fuguecoin.cpp b/fuguecoin.cpp
new file mode 100644
index 0000000..64c05d1
--- /dev/null
+++ b/fuguecoin.cpp
@@ -0,0 +1,74 @@
+#include "uint256.h"
+#include "sph_fugue.h"
+
+#include "cpuminer-config.h"
+#include "miner.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+extern "C" void my_fugue256_init(void *cc);
+extern "C" void my_fugue256(void *cc, const void *data, size_t len);
+extern "C" void my_fugue256_close(void *cc, void *dst);
+extern "C" void my_fugue256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst);
+
+// prepared contexts after the first 80 bytes
+sph_fugue256_context ctx_fugue_const[8];
+
+#define SWAP32(x) \
+	((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \
+	 (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
+
+extern "C" int scanhash_fugue256(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
+	uint32_t max_nonce, unsigned long *hashes_done)
+{
+	uint32_t start_nonce = pdata[19];
+	const uint32_t Htarg = ptarget[7];
+	const uint32_t throughPut = 4096 * 128;
+
+	// init
+	static bool init[8] = { false, false, false, false, false, false, false, false };
+	if(!init[thr_id])
+	{
+		fugue256_cpu_init(thr_id, throughPut);
+		init[thr_id] = true;
+	}
+
+	// an endian swap is necessary
+	uint32_t endiandata[20];
+	for (int kk=0; kk < 20; kk++)
+		be32enc(&endiandata[kk], pdata[kk]);
+
+	// prepare the context with the endian-swapped block header (the nonce is replaced later)
+	fugue256_cpu_setBlock(thr_id, endiandata, (void*)ptarget);
+
+	do {
+		// GPU
+		uint32_t foundNounce = 0xFFFFFFFF;
+		fugue256_cpu_hash(thr_id, throughPut, pdata[19], NULL, &foundNounce);
+
+		if(foundNounce < 0xffffffff)
+		{
+			uint32_t hash[8];
+			endiandata[19] = SWAP32(foundNounce);
+			sph_fugue256_context ctx_fugue;
+			sph_fugue256_init(&ctx_fugue);
+			sph_fugue256 (&ctx_fugue, endiandata, 80);
+			sph_fugue256_close(&ctx_fugue, &hash);
+
+			pdata[19] = foundNounce;
+			*hashes_done = SWAP32(foundNounce) - start_nonce + 1;
+			return 1;
+		}
+
+		if (pdata[19] + throughPut < pdata[19])
+			pdata[19] = max_nonce;
+		else pdata[19] += throughPut;
+
+
+	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
+
+	*hashes_done = pdata[19] - start_nonce + 1;
+	return 0;
+}
diff --git a/groestl.c b/groestl.c
new file mode 100644
index 0000000..cc685f4
--- /dev/null
+++ b/groestl.c
@@ -0,0 +1,3123 @@
+/* $Id: groestl.c 260 2011-07-21 01:02:38Z tp $ */
+/*
+ * Groestl implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010 Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+#include "sph_groestl.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_GROESTL
+#define SPH_SMALL_FOOTPRINT_GROESTL 1
+#endif
+
+/*
+ * Apparently, the 32-bit-only version is not faster than the 64-bit
+ * version unless using the "small footprint" code on a 32-bit machine.
+ */
+#if !defined SPH_GROESTL_64
+#if SPH_SMALL_FOOTPRINT_GROESTL && !SPH_64_TRUE
+#define SPH_GROESTL_64 0
+#else
+#define SPH_GROESTL_64 1
+#endif
+#endif
+
+#if !SPH_64
+#undef SPH_GROESTL_64
+#endif
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+/*
+ * The internal representation may use either big-endian or
+ * little-endian. Using the platform default representation speeds up
+ * encoding and decoding between bytes and the matrix columns.
+ */ + +#undef USE_LE +#if SPH_GROESTL_LITTLE_ENDIAN +#define USE_LE 1 +#elif SPH_GROESTL_BIG_ENDIAN +#define USE_LE 0 +#elif SPH_LITTLE_ENDIAN +#define USE_LE 1 +#endif + +#if USE_LE + +#define C32e(x) ((SPH_C32(x) >> 24) \ + | ((SPH_C32(x) >> 8) & SPH_C32(0x0000FF00)) \ + | ((SPH_C32(x) << 8) & SPH_C32(0x00FF0000)) \ + | ((SPH_C32(x) << 24) & SPH_C32(0xFF000000))) +#define dec32e_aligned sph_dec32le_aligned +#define enc32e sph_enc32le +#define B32_0(x) ((x) & 0xFF) +#define B32_1(x) (((x) >> 8) & 0xFF) +#define B32_2(x) (((x) >> 16) & 0xFF) +#define B32_3(x) ((x) >> 24) + +#define R32u(u, d) SPH_T32(((u) << 16) | ((d) >> 16)) +#define R32d(u, d) SPH_T32(((u) >> 16) | ((d) << 16)) + +#define PC32up(j, r) ((sph_u32)((j) + (r))) +#define PC32dn(j, r) 0 +#define QC32up(j, r) SPH_C32(0xFFFFFFFF) +#define QC32dn(j, r) (((sph_u32)(r) << 24) ^ SPH_T32(~((sph_u32)(j) << 24))) + +#if SPH_64 +#define C64e(x) ((SPH_C64(x) >> 56) \ + | ((SPH_C64(x) >> 40) & SPH_C64(0x000000000000FF00)) \ + | ((SPH_C64(x) >> 24) & SPH_C64(0x0000000000FF0000)) \ + | ((SPH_C64(x) >> 8) & SPH_C64(0x00000000FF000000)) \ + | ((SPH_C64(x) << 8) & SPH_C64(0x000000FF00000000)) \ + | ((SPH_C64(x) << 24) & SPH_C64(0x0000FF0000000000)) \ + | ((SPH_C64(x) << 40) & SPH_C64(0x00FF000000000000)) \ + | ((SPH_C64(x) << 56) & SPH_C64(0xFF00000000000000))) +#define dec64e_aligned sph_dec64le_aligned +#define enc64e sph_enc64le +#define B64_0(x) ((x) & 0xFF) +#define B64_1(x) (((x) >> 8) & 0xFF) +#define B64_2(x) (((x) >> 16) & 0xFF) +#define B64_3(x) (((x) >> 24) & 0xFF) +#define B64_4(x) (((x) >> 32) & 0xFF) +#define B64_5(x) (((x) >> 40) & 0xFF) +#define B64_6(x) (((x) >> 48) & 0xFF) +#define B64_7(x) ((x) >> 56) +#define R64 SPH_ROTL64 +#define PC64(j, r) ((sph_u64)((j) + (r))) +#define QC64(j, r) (((sph_u64)(r) << 56) ^ SPH_T64(~((sph_u64)(j) << 56))) +#endif + +#else + +#define C32e(x) SPH_C32(x) +#define dec32e_aligned sph_dec32be_aligned +#define enc32e sph_enc32be +#define B32_0(x) ((x) >> 24) +#define B32_1(x) (((x) >> 16) & 0xFF) +#define B32_2(x) (((x) >> 8) & 0xFF) +#define B32_3(x) ((x) & 0xFF) + +#define R32u(u, d) SPH_T32(((u) >> 16) | ((d) << 16)) +#define R32d(u, d) SPH_T32(((u) << 16) | ((d) >> 16)) + +#define PC32up(j, r) ((sph_u32)((j) + (r)) << 24) +#define PC32dn(j, r) 0 +#define QC32up(j, r) SPH_C32(0xFFFFFFFF) +#define QC32dn(j, r) ((sph_u32)(r) ^ SPH_T32(~(sph_u32)(j))) + +#if SPH_64 +#define C64e(x) SPH_C64(x) +#define dec64e_aligned sph_dec64be_aligned +#define enc64e sph_enc64be +#define B64_0(x) ((x) >> 56) +#define B64_1(x) (((x) >> 48) & 0xFF) +#define B64_2(x) (((x) >> 40) & 0xFF) +#define B64_3(x) (((x) >> 32) & 0xFF) +#define B64_4(x) (((x) >> 24) & 0xFF) +#define B64_5(x) (((x) >> 16) & 0xFF) +#define B64_6(x) (((x) >> 8) & 0xFF) +#define B64_7(x) ((x) & 0xFF) +#define R64 SPH_ROTR64 +#define PC64(j, r) ((sph_u64)((j) + (r)) << 56) +#define QC64(j, r) ((sph_u64)(r) ^ SPH_T64(~(sph_u64)(j))) +#endif + +#endif + +#if SPH_GROESTL_64 + +static const sph_u64 T0[] = { + C64e(0xc632f4a5f497a5c6), C64e(0xf86f978497eb84f8), + C64e(0xee5eb099b0c799ee), C64e(0xf67a8c8d8cf78df6), + C64e(0xffe8170d17e50dff), C64e(0xd60adcbddcb7bdd6), + C64e(0xde16c8b1c8a7b1de), C64e(0x916dfc54fc395491), + C64e(0x6090f050f0c05060), C64e(0x0207050305040302), + C64e(0xce2ee0a9e087a9ce), C64e(0x56d1877d87ac7d56), + C64e(0xe7cc2b192bd519e7), C64e(0xb513a662a67162b5), + C64e(0x4d7c31e6319ae64d), C64e(0xec59b59ab5c39aec), + C64e(0x8f40cf45cf05458f), C64e(0x1fa3bc9dbc3e9d1f), + C64e(0x8949c040c0094089), C64e(0xfa68928792ef87fa), + 
C64e(0xefd03f153fc515ef), C64e(0xb29426eb267febb2), + C64e(0x8ece40c94007c98e), C64e(0xfbe61d0b1ded0bfb), + C64e(0x416e2fec2f82ec41), C64e(0xb31aa967a97d67b3), + C64e(0x5f431cfd1cbefd5f), C64e(0x456025ea258aea45), + C64e(0x23f9dabfda46bf23), C64e(0x535102f702a6f753), + C64e(0xe445a196a1d396e4), C64e(0x9b76ed5bed2d5b9b), + C64e(0x75285dc25deac275), C64e(0xe1c5241c24d91ce1), + C64e(0x3dd4e9aee97aae3d), C64e(0x4cf2be6abe986a4c), + C64e(0x6c82ee5aeed85a6c), C64e(0x7ebdc341c3fc417e), + C64e(0xf5f3060206f102f5), C64e(0x8352d14fd11d4f83), + C64e(0x688ce45ce4d05c68), C64e(0x515607f407a2f451), + C64e(0xd18d5c345cb934d1), C64e(0xf9e1180818e908f9), + C64e(0xe24cae93aedf93e2), C64e(0xab3e9573954d73ab), + C64e(0x6297f553f5c45362), C64e(0x2a6b413f41543f2a), + C64e(0x081c140c14100c08), C64e(0x9563f652f6315295), + C64e(0x46e9af65af8c6546), C64e(0x9d7fe25ee2215e9d), + C64e(0x3048782878602830), C64e(0x37cff8a1f86ea137), + C64e(0x0a1b110f11140f0a), C64e(0x2febc4b5c45eb52f), + C64e(0x0e151b091b1c090e), C64e(0x247e5a365a483624), + C64e(0x1badb69bb6369b1b), C64e(0xdf98473d47a53ddf), + C64e(0xcda76a266a8126cd), C64e(0x4ef5bb69bb9c694e), + C64e(0x7f334ccd4cfecd7f), C64e(0xea50ba9fbacf9fea), + C64e(0x123f2d1b2d241b12), C64e(0x1da4b99eb93a9e1d), + C64e(0x58c49c749cb07458), C64e(0x3446722e72682e34), + C64e(0x3641772d776c2d36), C64e(0xdc11cdb2cda3b2dc), + C64e(0xb49d29ee2973eeb4), C64e(0x5b4d16fb16b6fb5b), + C64e(0xa4a501f60153f6a4), C64e(0x76a1d74dd7ec4d76), + C64e(0xb714a361a37561b7), C64e(0x7d3449ce49face7d), + C64e(0x52df8d7b8da47b52), C64e(0xdd9f423e42a13edd), + C64e(0x5ecd937193bc715e), C64e(0x13b1a297a2269713), + C64e(0xa6a204f50457f5a6), C64e(0xb901b868b86968b9), + C64e(0x0000000000000000), C64e(0xc1b5742c74992cc1), + C64e(0x40e0a060a0806040), C64e(0xe3c2211f21dd1fe3), + C64e(0x793a43c843f2c879), C64e(0xb69a2ced2c77edb6), + C64e(0xd40dd9bed9b3bed4), C64e(0x8d47ca46ca01468d), + C64e(0x671770d970ced967), C64e(0x72afdd4bdde44b72), + C64e(0x94ed79de7933de94), C64e(0x98ff67d4672bd498), + C64e(0xb09323e8237be8b0), C64e(0x855bde4ade114a85), + C64e(0xbb06bd6bbd6d6bbb), C64e(0xc5bb7e2a7e912ac5), + C64e(0x4f7b34e5349ee54f), C64e(0xedd73a163ac116ed), + C64e(0x86d254c55417c586), C64e(0x9af862d7622fd79a), + C64e(0x6699ff55ffcc5566), C64e(0x11b6a794a7229411), + C64e(0x8ac04acf4a0fcf8a), C64e(0xe9d9301030c910e9), + C64e(0x040e0a060a080604), C64e(0xfe66988198e781fe), + C64e(0xa0ab0bf00b5bf0a0), C64e(0x78b4cc44ccf04478), + C64e(0x25f0d5bad54aba25), C64e(0x4b753ee33e96e34b), + C64e(0xa2ac0ef30e5ff3a2), C64e(0x5d4419fe19bafe5d), + C64e(0x80db5bc05b1bc080), C64e(0x0580858a850a8a05), + C64e(0x3fd3ecadec7ead3f), C64e(0x21fedfbcdf42bc21), + C64e(0x70a8d848d8e04870), C64e(0xf1fd0c040cf904f1), + C64e(0x63197adf7ac6df63), C64e(0x772f58c158eec177), + C64e(0xaf309f759f4575af), C64e(0x42e7a563a5846342), + C64e(0x2070503050403020), C64e(0xe5cb2e1a2ed11ae5), + C64e(0xfdef120e12e10efd), C64e(0xbf08b76db7656dbf), + C64e(0x8155d44cd4194c81), C64e(0x18243c143c301418), + C64e(0x26795f355f4c3526), C64e(0xc3b2712f719d2fc3), + C64e(0xbe8638e13867e1be), C64e(0x35c8fda2fd6aa235), + C64e(0x88c74fcc4f0bcc88), C64e(0x2e654b394b5c392e), + C64e(0x936af957f93d5793), C64e(0x55580df20daaf255), + C64e(0xfc619d829de382fc), C64e(0x7ab3c947c9f4477a), + C64e(0xc827efacef8bacc8), C64e(0xba8832e7326fe7ba), + C64e(0x324f7d2b7d642b32), C64e(0xe642a495a4d795e6), + C64e(0xc03bfba0fb9ba0c0), C64e(0x19aab398b3329819), + C64e(0x9ef668d16827d19e), C64e(0xa322817f815d7fa3), + C64e(0x44eeaa66aa886644), C64e(0x54d6827e82a87e54), + C64e(0x3bdde6abe676ab3b), 
C64e(0x0b959e839e16830b), + C64e(0x8cc945ca4503ca8c), C64e(0xc7bc7b297b9529c7), + C64e(0x6b056ed36ed6d36b), C64e(0x286c443c44503c28), + C64e(0xa72c8b798b5579a7), C64e(0xbc813de23d63e2bc), + C64e(0x1631271d272c1d16), C64e(0xad379a769a4176ad), + C64e(0xdb964d3b4dad3bdb), C64e(0x649efa56fac85664), + C64e(0x74a6d24ed2e84e74), C64e(0x1436221e22281e14), + C64e(0x92e476db763fdb92), C64e(0x0c121e0a1e180a0c), + C64e(0x48fcb46cb4906c48), C64e(0xb88f37e4376be4b8), + C64e(0x9f78e75de7255d9f), C64e(0xbd0fb26eb2616ebd), + C64e(0x43692aef2a86ef43), C64e(0xc435f1a6f193a6c4), + C64e(0x39dae3a8e372a839), C64e(0x31c6f7a4f762a431), + C64e(0xd38a593759bd37d3), C64e(0xf274868b86ff8bf2), + C64e(0xd583563256b132d5), C64e(0x8b4ec543c50d438b), + C64e(0x6e85eb59ebdc596e), C64e(0xda18c2b7c2afb7da), + C64e(0x018e8f8c8f028c01), C64e(0xb11dac64ac7964b1), + C64e(0x9cf16dd26d23d29c), C64e(0x49723be03b92e049), + C64e(0xd81fc7b4c7abb4d8), C64e(0xacb915fa1543faac), + C64e(0xf3fa090709fd07f3), C64e(0xcfa06f256f8525cf), + C64e(0xca20eaafea8fafca), C64e(0xf47d898e89f38ef4), + C64e(0x476720e9208ee947), C64e(0x1038281828201810), + C64e(0x6f0b64d564ded56f), C64e(0xf073838883fb88f0), + C64e(0x4afbb16fb1946f4a), C64e(0x5cca967296b8725c), + C64e(0x38546c246c702438), C64e(0x575f08f108aef157), + C64e(0x732152c752e6c773), C64e(0x9764f351f3355197), + C64e(0xcbae6523658d23cb), C64e(0xa125847c84597ca1), + C64e(0xe857bf9cbfcb9ce8), C64e(0x3e5d6321637c213e), + C64e(0x96ea7cdd7c37dd96), C64e(0x611e7fdc7fc2dc61), + C64e(0x0d9c9186911a860d), C64e(0x0f9b9485941e850f), + C64e(0xe04bab90abdb90e0), C64e(0x7cbac642c6f8427c), + C64e(0x712657c457e2c471), C64e(0xcc29e5aae583aacc), + C64e(0x90e373d8733bd890), C64e(0x06090f050f0c0506), + C64e(0xf7f4030103f501f7), C64e(0x1c2a36123638121c), + C64e(0xc23cfea3fe9fa3c2), C64e(0x6a8be15fe1d45f6a), + C64e(0xaebe10f91047f9ae), C64e(0x69026bd06bd2d069), + C64e(0x17bfa891a82e9117), C64e(0x9971e858e8295899), + C64e(0x3a5369276974273a), C64e(0x27f7d0b9d04eb927), + C64e(0xd991483848a938d9), C64e(0xebde351335cd13eb), + C64e(0x2be5ceb3ce56b32b), C64e(0x2277553355443322), + C64e(0xd204d6bbd6bfbbd2), C64e(0xa9399070904970a9), + C64e(0x07878089800e8907), C64e(0x33c1f2a7f266a733), + C64e(0x2decc1b6c15ab62d), C64e(0x3c5a66226678223c), + C64e(0x15b8ad92ad2a9215), C64e(0xc9a96020608920c9), + C64e(0x875cdb49db154987), C64e(0xaab01aff1a4fffaa), + C64e(0x50d8887888a07850), C64e(0xa52b8e7a8e517aa5), + C64e(0x03898a8f8a068f03), C64e(0x594a13f813b2f859), + C64e(0x09929b809b128009), C64e(0x1a2339173934171a), + C64e(0x651075da75cada65), C64e(0xd784533153b531d7), + C64e(0x84d551c65113c684), C64e(0xd003d3b8d3bbb8d0), + C64e(0x82dc5ec35e1fc382), C64e(0x29e2cbb0cb52b029), + C64e(0x5ac3997799b4775a), C64e(0x1e2d3311333c111e), + C64e(0x7b3d46cb46f6cb7b), C64e(0xa8b71ffc1f4bfca8), + C64e(0x6d0c61d661dad66d), C64e(0x2c624e3a4e583a2c) +}; + +#if !SPH_SMALL_FOOTPRINT_GROESTL + +static const sph_u64 T1[] = { + C64e(0xc6c632f4a5f497a5), C64e(0xf8f86f978497eb84), + C64e(0xeeee5eb099b0c799), C64e(0xf6f67a8c8d8cf78d), + C64e(0xffffe8170d17e50d), C64e(0xd6d60adcbddcb7bd), + C64e(0xdede16c8b1c8a7b1), C64e(0x91916dfc54fc3954), + C64e(0x606090f050f0c050), C64e(0x0202070503050403), + C64e(0xcece2ee0a9e087a9), C64e(0x5656d1877d87ac7d), + C64e(0xe7e7cc2b192bd519), C64e(0xb5b513a662a67162), + C64e(0x4d4d7c31e6319ae6), C64e(0xecec59b59ab5c39a), + C64e(0x8f8f40cf45cf0545), C64e(0x1f1fa3bc9dbc3e9d), + C64e(0x898949c040c00940), C64e(0xfafa68928792ef87), + C64e(0xefefd03f153fc515), C64e(0xb2b29426eb267feb), + C64e(0x8e8ece40c94007c9), C64e(0xfbfbe61d0b1ded0b), 
+ C64e(0x41416e2fec2f82ec), C64e(0xb3b31aa967a97d67), + C64e(0x5f5f431cfd1cbefd), C64e(0x45456025ea258aea), + C64e(0x2323f9dabfda46bf), C64e(0x53535102f702a6f7), + C64e(0xe4e445a196a1d396), C64e(0x9b9b76ed5bed2d5b), + C64e(0x7575285dc25deac2), C64e(0xe1e1c5241c24d91c), + C64e(0x3d3dd4e9aee97aae), C64e(0x4c4cf2be6abe986a), + C64e(0x6c6c82ee5aeed85a), C64e(0x7e7ebdc341c3fc41), + C64e(0xf5f5f3060206f102), C64e(0x838352d14fd11d4f), + C64e(0x68688ce45ce4d05c), C64e(0x51515607f407a2f4), + C64e(0xd1d18d5c345cb934), C64e(0xf9f9e1180818e908), + C64e(0xe2e24cae93aedf93), C64e(0xabab3e9573954d73), + C64e(0x626297f553f5c453), C64e(0x2a2a6b413f41543f), + C64e(0x08081c140c14100c), C64e(0x959563f652f63152), + C64e(0x4646e9af65af8c65), C64e(0x9d9d7fe25ee2215e), + C64e(0x3030487828786028), C64e(0x3737cff8a1f86ea1), + C64e(0x0a0a1b110f11140f), C64e(0x2f2febc4b5c45eb5), + C64e(0x0e0e151b091b1c09), C64e(0x24247e5a365a4836), + C64e(0x1b1badb69bb6369b), C64e(0xdfdf98473d47a53d), + C64e(0xcdcda76a266a8126), C64e(0x4e4ef5bb69bb9c69), + C64e(0x7f7f334ccd4cfecd), C64e(0xeaea50ba9fbacf9f), + C64e(0x12123f2d1b2d241b), C64e(0x1d1da4b99eb93a9e), + C64e(0x5858c49c749cb074), C64e(0x343446722e72682e), + C64e(0x363641772d776c2d), C64e(0xdcdc11cdb2cda3b2), + C64e(0xb4b49d29ee2973ee), C64e(0x5b5b4d16fb16b6fb), + C64e(0xa4a4a501f60153f6), C64e(0x7676a1d74dd7ec4d), + C64e(0xb7b714a361a37561), C64e(0x7d7d3449ce49face), + C64e(0x5252df8d7b8da47b), C64e(0xdddd9f423e42a13e), + C64e(0x5e5ecd937193bc71), C64e(0x1313b1a297a22697), + C64e(0xa6a6a204f50457f5), C64e(0xb9b901b868b86968), + C64e(0x0000000000000000), C64e(0xc1c1b5742c74992c), + C64e(0x4040e0a060a08060), C64e(0xe3e3c2211f21dd1f), + C64e(0x79793a43c843f2c8), C64e(0xb6b69a2ced2c77ed), + C64e(0xd4d40dd9bed9b3be), C64e(0x8d8d47ca46ca0146), + C64e(0x67671770d970ced9), C64e(0x7272afdd4bdde44b), + C64e(0x9494ed79de7933de), C64e(0x9898ff67d4672bd4), + C64e(0xb0b09323e8237be8), C64e(0x85855bde4ade114a), + C64e(0xbbbb06bd6bbd6d6b), C64e(0xc5c5bb7e2a7e912a), + C64e(0x4f4f7b34e5349ee5), C64e(0xededd73a163ac116), + C64e(0x8686d254c55417c5), C64e(0x9a9af862d7622fd7), + C64e(0x666699ff55ffcc55), C64e(0x1111b6a794a72294), + C64e(0x8a8ac04acf4a0fcf), C64e(0xe9e9d9301030c910), + C64e(0x04040e0a060a0806), C64e(0xfefe66988198e781), + C64e(0xa0a0ab0bf00b5bf0), C64e(0x7878b4cc44ccf044), + C64e(0x2525f0d5bad54aba), C64e(0x4b4b753ee33e96e3), + C64e(0xa2a2ac0ef30e5ff3), C64e(0x5d5d4419fe19bafe), + C64e(0x8080db5bc05b1bc0), C64e(0x050580858a850a8a), + C64e(0x3f3fd3ecadec7ead), C64e(0x2121fedfbcdf42bc), + C64e(0x7070a8d848d8e048), C64e(0xf1f1fd0c040cf904), + C64e(0x6363197adf7ac6df), C64e(0x77772f58c158eec1), + C64e(0xafaf309f759f4575), C64e(0x4242e7a563a58463), + C64e(0x2020705030504030), C64e(0xe5e5cb2e1a2ed11a), + C64e(0xfdfdef120e12e10e), C64e(0xbfbf08b76db7656d), + C64e(0x818155d44cd4194c), C64e(0x1818243c143c3014), + C64e(0x2626795f355f4c35), C64e(0xc3c3b2712f719d2f), + C64e(0xbebe8638e13867e1), C64e(0x3535c8fda2fd6aa2), + C64e(0x8888c74fcc4f0bcc), C64e(0x2e2e654b394b5c39), + C64e(0x93936af957f93d57), C64e(0x5555580df20daaf2), + C64e(0xfcfc619d829de382), C64e(0x7a7ab3c947c9f447), + C64e(0xc8c827efacef8bac), C64e(0xbaba8832e7326fe7), + C64e(0x32324f7d2b7d642b), C64e(0xe6e642a495a4d795), + C64e(0xc0c03bfba0fb9ba0), C64e(0x1919aab398b33298), + C64e(0x9e9ef668d16827d1), C64e(0xa3a322817f815d7f), + C64e(0x4444eeaa66aa8866), C64e(0x5454d6827e82a87e), + C64e(0x3b3bdde6abe676ab), C64e(0x0b0b959e839e1683), + C64e(0x8c8cc945ca4503ca), C64e(0xc7c7bc7b297b9529), + C64e(0x6b6b056ed36ed6d3), 
C64e(0x28286c443c44503c), + C64e(0xa7a72c8b798b5579), C64e(0xbcbc813de23d63e2), + C64e(0x161631271d272c1d), C64e(0xadad379a769a4176), + C64e(0xdbdb964d3b4dad3b), C64e(0x64649efa56fac856), + C64e(0x7474a6d24ed2e84e), C64e(0x141436221e22281e), + C64e(0x9292e476db763fdb), C64e(0x0c0c121e0a1e180a), + C64e(0x4848fcb46cb4906c), C64e(0xb8b88f37e4376be4), + C64e(0x9f9f78e75de7255d), C64e(0xbdbd0fb26eb2616e), + C64e(0x4343692aef2a86ef), C64e(0xc4c435f1a6f193a6), + C64e(0x3939dae3a8e372a8), C64e(0x3131c6f7a4f762a4), + C64e(0xd3d38a593759bd37), C64e(0xf2f274868b86ff8b), + C64e(0xd5d583563256b132), C64e(0x8b8b4ec543c50d43), + C64e(0x6e6e85eb59ebdc59), C64e(0xdada18c2b7c2afb7), + C64e(0x01018e8f8c8f028c), C64e(0xb1b11dac64ac7964), + C64e(0x9c9cf16dd26d23d2), C64e(0x4949723be03b92e0), + C64e(0xd8d81fc7b4c7abb4), C64e(0xacacb915fa1543fa), + C64e(0xf3f3fa090709fd07), C64e(0xcfcfa06f256f8525), + C64e(0xcaca20eaafea8faf), C64e(0xf4f47d898e89f38e), + C64e(0x47476720e9208ee9), C64e(0x1010382818282018), + C64e(0x6f6f0b64d564ded5), C64e(0xf0f073838883fb88), + C64e(0x4a4afbb16fb1946f), C64e(0x5c5cca967296b872), + C64e(0x3838546c246c7024), C64e(0x57575f08f108aef1), + C64e(0x73732152c752e6c7), C64e(0x979764f351f33551), + C64e(0xcbcbae6523658d23), C64e(0xa1a125847c84597c), + C64e(0xe8e857bf9cbfcb9c), C64e(0x3e3e5d6321637c21), + C64e(0x9696ea7cdd7c37dd), C64e(0x61611e7fdc7fc2dc), + C64e(0x0d0d9c9186911a86), C64e(0x0f0f9b9485941e85), + C64e(0xe0e04bab90abdb90), C64e(0x7c7cbac642c6f842), + C64e(0x71712657c457e2c4), C64e(0xcccc29e5aae583aa), + C64e(0x9090e373d8733bd8), C64e(0x0606090f050f0c05), + C64e(0xf7f7f4030103f501), C64e(0x1c1c2a3612363812), + C64e(0xc2c23cfea3fe9fa3), C64e(0x6a6a8be15fe1d45f), + C64e(0xaeaebe10f91047f9), C64e(0x6969026bd06bd2d0), + C64e(0x1717bfa891a82e91), C64e(0x999971e858e82958), + C64e(0x3a3a536927697427), C64e(0x2727f7d0b9d04eb9), + C64e(0xd9d991483848a938), C64e(0xebebde351335cd13), + C64e(0x2b2be5ceb3ce56b3), C64e(0x2222775533554433), + C64e(0xd2d204d6bbd6bfbb), C64e(0xa9a9399070904970), + C64e(0x0707878089800e89), C64e(0x3333c1f2a7f266a7), + C64e(0x2d2decc1b6c15ab6), C64e(0x3c3c5a6622667822), + C64e(0x1515b8ad92ad2a92), C64e(0xc9c9a96020608920), + C64e(0x87875cdb49db1549), C64e(0xaaaab01aff1a4fff), + C64e(0x5050d8887888a078), C64e(0xa5a52b8e7a8e517a), + C64e(0x0303898a8f8a068f), C64e(0x59594a13f813b2f8), + C64e(0x0909929b809b1280), C64e(0x1a1a233917393417), + C64e(0x65651075da75cada), C64e(0xd7d784533153b531), + C64e(0x8484d551c65113c6), C64e(0xd0d003d3b8d3bbb8), + C64e(0x8282dc5ec35e1fc3), C64e(0x2929e2cbb0cb52b0), + C64e(0x5a5ac3997799b477), C64e(0x1e1e2d3311333c11), + C64e(0x7b7b3d46cb46f6cb), C64e(0xa8a8b71ffc1f4bfc), + C64e(0x6d6d0c61d661dad6), C64e(0x2c2c624e3a4e583a) +}; + +static const sph_u64 T2[] = { + C64e(0xa5c6c632f4a5f497), C64e(0x84f8f86f978497eb), + C64e(0x99eeee5eb099b0c7), C64e(0x8df6f67a8c8d8cf7), + C64e(0x0dffffe8170d17e5), C64e(0xbdd6d60adcbddcb7), + C64e(0xb1dede16c8b1c8a7), C64e(0x5491916dfc54fc39), + C64e(0x50606090f050f0c0), C64e(0x0302020705030504), + C64e(0xa9cece2ee0a9e087), C64e(0x7d5656d1877d87ac), + C64e(0x19e7e7cc2b192bd5), C64e(0x62b5b513a662a671), + C64e(0xe64d4d7c31e6319a), C64e(0x9aecec59b59ab5c3), + C64e(0x458f8f40cf45cf05), C64e(0x9d1f1fa3bc9dbc3e), + C64e(0x40898949c040c009), C64e(0x87fafa68928792ef), + C64e(0x15efefd03f153fc5), C64e(0xebb2b29426eb267f), + C64e(0xc98e8ece40c94007), C64e(0x0bfbfbe61d0b1ded), + C64e(0xec41416e2fec2f82), C64e(0x67b3b31aa967a97d), + C64e(0xfd5f5f431cfd1cbe), C64e(0xea45456025ea258a), + C64e(0xbf2323f9dabfda46), 
C64e(0xf753535102f702a6), + C64e(0x96e4e445a196a1d3), C64e(0x5b9b9b76ed5bed2d), + C64e(0xc27575285dc25dea), C64e(0x1ce1e1c5241c24d9), + C64e(0xae3d3dd4e9aee97a), C64e(0x6a4c4cf2be6abe98), + C64e(0x5a6c6c82ee5aeed8), C64e(0x417e7ebdc341c3fc), + C64e(0x02f5f5f3060206f1), C64e(0x4f838352d14fd11d), + C64e(0x5c68688ce45ce4d0), C64e(0xf451515607f407a2), + C64e(0x34d1d18d5c345cb9), C64e(0x08f9f9e1180818e9), + C64e(0x93e2e24cae93aedf), C64e(0x73abab3e9573954d), + C64e(0x53626297f553f5c4), C64e(0x3f2a2a6b413f4154), + C64e(0x0c08081c140c1410), C64e(0x52959563f652f631), + C64e(0x654646e9af65af8c), C64e(0x5e9d9d7fe25ee221), + C64e(0x2830304878287860), C64e(0xa13737cff8a1f86e), + C64e(0x0f0a0a1b110f1114), C64e(0xb52f2febc4b5c45e), + C64e(0x090e0e151b091b1c), C64e(0x3624247e5a365a48), + C64e(0x9b1b1badb69bb636), C64e(0x3ddfdf98473d47a5), + C64e(0x26cdcda76a266a81), C64e(0x694e4ef5bb69bb9c), + C64e(0xcd7f7f334ccd4cfe), C64e(0x9feaea50ba9fbacf), + C64e(0x1b12123f2d1b2d24), C64e(0x9e1d1da4b99eb93a), + C64e(0x745858c49c749cb0), C64e(0x2e343446722e7268), + C64e(0x2d363641772d776c), C64e(0xb2dcdc11cdb2cda3), + C64e(0xeeb4b49d29ee2973), C64e(0xfb5b5b4d16fb16b6), + C64e(0xf6a4a4a501f60153), C64e(0x4d7676a1d74dd7ec), + C64e(0x61b7b714a361a375), C64e(0xce7d7d3449ce49fa), + C64e(0x7b5252df8d7b8da4), C64e(0x3edddd9f423e42a1), + C64e(0x715e5ecd937193bc), C64e(0x971313b1a297a226), + C64e(0xf5a6a6a204f50457), C64e(0x68b9b901b868b869), + C64e(0x0000000000000000), C64e(0x2cc1c1b5742c7499), + C64e(0x604040e0a060a080), C64e(0x1fe3e3c2211f21dd), + C64e(0xc879793a43c843f2), C64e(0xedb6b69a2ced2c77), + C64e(0xbed4d40dd9bed9b3), C64e(0x468d8d47ca46ca01), + C64e(0xd967671770d970ce), C64e(0x4b7272afdd4bdde4), + C64e(0xde9494ed79de7933), C64e(0xd49898ff67d4672b), + C64e(0xe8b0b09323e8237b), C64e(0x4a85855bde4ade11), + C64e(0x6bbbbb06bd6bbd6d), C64e(0x2ac5c5bb7e2a7e91), + C64e(0xe54f4f7b34e5349e), C64e(0x16ededd73a163ac1), + C64e(0xc58686d254c55417), C64e(0xd79a9af862d7622f), + C64e(0x55666699ff55ffcc), C64e(0x941111b6a794a722), + C64e(0xcf8a8ac04acf4a0f), C64e(0x10e9e9d9301030c9), + C64e(0x0604040e0a060a08), C64e(0x81fefe66988198e7), + C64e(0xf0a0a0ab0bf00b5b), C64e(0x447878b4cc44ccf0), + C64e(0xba2525f0d5bad54a), C64e(0xe34b4b753ee33e96), + C64e(0xf3a2a2ac0ef30e5f), C64e(0xfe5d5d4419fe19ba), + C64e(0xc08080db5bc05b1b), C64e(0x8a050580858a850a), + C64e(0xad3f3fd3ecadec7e), C64e(0xbc2121fedfbcdf42), + C64e(0x487070a8d848d8e0), C64e(0x04f1f1fd0c040cf9), + C64e(0xdf6363197adf7ac6), C64e(0xc177772f58c158ee), + C64e(0x75afaf309f759f45), C64e(0x634242e7a563a584), + C64e(0x3020207050305040), C64e(0x1ae5e5cb2e1a2ed1), + C64e(0x0efdfdef120e12e1), C64e(0x6dbfbf08b76db765), + C64e(0x4c818155d44cd419), C64e(0x141818243c143c30), + C64e(0x352626795f355f4c), C64e(0x2fc3c3b2712f719d), + C64e(0xe1bebe8638e13867), C64e(0xa23535c8fda2fd6a), + C64e(0xcc8888c74fcc4f0b), C64e(0x392e2e654b394b5c), + C64e(0x5793936af957f93d), C64e(0xf25555580df20daa), + C64e(0x82fcfc619d829de3), C64e(0x477a7ab3c947c9f4), + C64e(0xacc8c827efacef8b), C64e(0xe7baba8832e7326f), + C64e(0x2b32324f7d2b7d64), C64e(0x95e6e642a495a4d7), + C64e(0xa0c0c03bfba0fb9b), C64e(0x981919aab398b332), + C64e(0xd19e9ef668d16827), C64e(0x7fa3a322817f815d), + C64e(0x664444eeaa66aa88), C64e(0x7e5454d6827e82a8), + C64e(0xab3b3bdde6abe676), C64e(0x830b0b959e839e16), + C64e(0xca8c8cc945ca4503), C64e(0x29c7c7bc7b297b95), + C64e(0xd36b6b056ed36ed6), C64e(0x3c28286c443c4450), + C64e(0x79a7a72c8b798b55), C64e(0xe2bcbc813de23d63), + C64e(0x1d161631271d272c), C64e(0x76adad379a769a41), + 
C64e(0x3bdbdb964d3b4dad), C64e(0x5664649efa56fac8), + C64e(0x4e7474a6d24ed2e8), C64e(0x1e141436221e2228), + C64e(0xdb9292e476db763f), C64e(0x0a0c0c121e0a1e18), + C64e(0x6c4848fcb46cb490), C64e(0xe4b8b88f37e4376b), + C64e(0x5d9f9f78e75de725), C64e(0x6ebdbd0fb26eb261), + C64e(0xef4343692aef2a86), C64e(0xa6c4c435f1a6f193), + C64e(0xa83939dae3a8e372), C64e(0xa43131c6f7a4f762), + C64e(0x37d3d38a593759bd), C64e(0x8bf2f274868b86ff), + C64e(0x32d5d583563256b1), C64e(0x438b8b4ec543c50d), + C64e(0x596e6e85eb59ebdc), C64e(0xb7dada18c2b7c2af), + C64e(0x8c01018e8f8c8f02), C64e(0x64b1b11dac64ac79), + C64e(0xd29c9cf16dd26d23), C64e(0xe04949723be03b92), + C64e(0xb4d8d81fc7b4c7ab), C64e(0xfaacacb915fa1543), + C64e(0x07f3f3fa090709fd), C64e(0x25cfcfa06f256f85), + C64e(0xafcaca20eaafea8f), C64e(0x8ef4f47d898e89f3), + C64e(0xe947476720e9208e), C64e(0x1810103828182820), + C64e(0xd56f6f0b64d564de), C64e(0x88f0f073838883fb), + C64e(0x6f4a4afbb16fb194), C64e(0x725c5cca967296b8), + C64e(0x243838546c246c70), C64e(0xf157575f08f108ae), + C64e(0xc773732152c752e6), C64e(0x51979764f351f335), + C64e(0x23cbcbae6523658d), C64e(0x7ca1a125847c8459), + C64e(0x9ce8e857bf9cbfcb), C64e(0x213e3e5d6321637c), + C64e(0xdd9696ea7cdd7c37), C64e(0xdc61611e7fdc7fc2), + C64e(0x860d0d9c9186911a), C64e(0x850f0f9b9485941e), + C64e(0x90e0e04bab90abdb), C64e(0x427c7cbac642c6f8), + C64e(0xc471712657c457e2), C64e(0xaacccc29e5aae583), + C64e(0xd89090e373d8733b), C64e(0x050606090f050f0c), + C64e(0x01f7f7f4030103f5), C64e(0x121c1c2a36123638), + C64e(0xa3c2c23cfea3fe9f), C64e(0x5f6a6a8be15fe1d4), + C64e(0xf9aeaebe10f91047), C64e(0xd06969026bd06bd2), + C64e(0x911717bfa891a82e), C64e(0x58999971e858e829), + C64e(0x273a3a5369276974), C64e(0xb92727f7d0b9d04e), + C64e(0x38d9d991483848a9), C64e(0x13ebebde351335cd), + C64e(0xb32b2be5ceb3ce56), C64e(0x3322227755335544), + C64e(0xbbd2d204d6bbd6bf), C64e(0x70a9a93990709049), + C64e(0x890707878089800e), C64e(0xa73333c1f2a7f266), + C64e(0xb62d2decc1b6c15a), C64e(0x223c3c5a66226678), + C64e(0x921515b8ad92ad2a), C64e(0x20c9c9a960206089), + C64e(0x4987875cdb49db15), C64e(0xffaaaab01aff1a4f), + C64e(0x785050d8887888a0), C64e(0x7aa5a52b8e7a8e51), + C64e(0x8f0303898a8f8a06), C64e(0xf859594a13f813b2), + C64e(0x800909929b809b12), C64e(0x171a1a2339173934), + C64e(0xda65651075da75ca), C64e(0x31d7d784533153b5), + C64e(0xc68484d551c65113), C64e(0xb8d0d003d3b8d3bb), + C64e(0xc38282dc5ec35e1f), C64e(0xb02929e2cbb0cb52), + C64e(0x775a5ac3997799b4), C64e(0x111e1e2d3311333c), + C64e(0xcb7b7b3d46cb46f6), C64e(0xfca8a8b71ffc1f4b), + C64e(0xd66d6d0c61d661da), C64e(0x3a2c2c624e3a4e58) +}; + +static const sph_u64 T3[] = { + C64e(0x97a5c6c632f4a5f4), C64e(0xeb84f8f86f978497), + C64e(0xc799eeee5eb099b0), C64e(0xf78df6f67a8c8d8c), + C64e(0xe50dffffe8170d17), C64e(0xb7bdd6d60adcbddc), + C64e(0xa7b1dede16c8b1c8), C64e(0x395491916dfc54fc), + C64e(0xc050606090f050f0), C64e(0x0403020207050305), + C64e(0x87a9cece2ee0a9e0), C64e(0xac7d5656d1877d87), + C64e(0xd519e7e7cc2b192b), C64e(0x7162b5b513a662a6), + C64e(0x9ae64d4d7c31e631), C64e(0xc39aecec59b59ab5), + C64e(0x05458f8f40cf45cf), C64e(0x3e9d1f1fa3bc9dbc), + C64e(0x0940898949c040c0), C64e(0xef87fafa68928792), + C64e(0xc515efefd03f153f), C64e(0x7febb2b29426eb26), + C64e(0x07c98e8ece40c940), C64e(0xed0bfbfbe61d0b1d), + C64e(0x82ec41416e2fec2f), C64e(0x7d67b3b31aa967a9), + C64e(0xbefd5f5f431cfd1c), C64e(0x8aea45456025ea25), + C64e(0x46bf2323f9dabfda), C64e(0xa6f753535102f702), + C64e(0xd396e4e445a196a1), C64e(0x2d5b9b9b76ed5bed), + C64e(0xeac27575285dc25d), C64e(0xd91ce1e1c5241c24), + 
C64e(0x7aae3d3dd4e9aee9), C64e(0x986a4c4cf2be6abe), + C64e(0xd85a6c6c82ee5aee), C64e(0xfc417e7ebdc341c3), + C64e(0xf102f5f5f3060206), C64e(0x1d4f838352d14fd1), + C64e(0xd05c68688ce45ce4), C64e(0xa2f451515607f407), + C64e(0xb934d1d18d5c345c), C64e(0xe908f9f9e1180818), + C64e(0xdf93e2e24cae93ae), C64e(0x4d73abab3e957395), + C64e(0xc453626297f553f5), C64e(0x543f2a2a6b413f41), + C64e(0x100c08081c140c14), C64e(0x3152959563f652f6), + C64e(0x8c654646e9af65af), C64e(0x215e9d9d7fe25ee2), + C64e(0x6028303048782878), C64e(0x6ea13737cff8a1f8), + C64e(0x140f0a0a1b110f11), C64e(0x5eb52f2febc4b5c4), + C64e(0x1c090e0e151b091b), C64e(0x483624247e5a365a), + C64e(0x369b1b1badb69bb6), C64e(0xa53ddfdf98473d47), + C64e(0x8126cdcda76a266a), C64e(0x9c694e4ef5bb69bb), + C64e(0xfecd7f7f334ccd4c), C64e(0xcf9feaea50ba9fba), + C64e(0x241b12123f2d1b2d), C64e(0x3a9e1d1da4b99eb9), + C64e(0xb0745858c49c749c), C64e(0x682e343446722e72), + C64e(0x6c2d363641772d77), C64e(0xa3b2dcdc11cdb2cd), + C64e(0x73eeb4b49d29ee29), C64e(0xb6fb5b5b4d16fb16), + C64e(0x53f6a4a4a501f601), C64e(0xec4d7676a1d74dd7), + C64e(0x7561b7b714a361a3), C64e(0xface7d7d3449ce49), + C64e(0xa47b5252df8d7b8d), C64e(0xa13edddd9f423e42), + C64e(0xbc715e5ecd937193), C64e(0x26971313b1a297a2), + C64e(0x57f5a6a6a204f504), C64e(0x6968b9b901b868b8), + C64e(0x0000000000000000), C64e(0x992cc1c1b5742c74), + C64e(0x80604040e0a060a0), C64e(0xdd1fe3e3c2211f21), + C64e(0xf2c879793a43c843), C64e(0x77edb6b69a2ced2c), + C64e(0xb3bed4d40dd9bed9), C64e(0x01468d8d47ca46ca), + C64e(0xced967671770d970), C64e(0xe44b7272afdd4bdd), + C64e(0x33de9494ed79de79), C64e(0x2bd49898ff67d467), + C64e(0x7be8b0b09323e823), C64e(0x114a85855bde4ade), + C64e(0x6d6bbbbb06bd6bbd), C64e(0x912ac5c5bb7e2a7e), + C64e(0x9ee54f4f7b34e534), C64e(0xc116ededd73a163a), + C64e(0x17c58686d254c554), C64e(0x2fd79a9af862d762), + C64e(0xcc55666699ff55ff), C64e(0x22941111b6a794a7), + C64e(0x0fcf8a8ac04acf4a), C64e(0xc910e9e9d9301030), + C64e(0x080604040e0a060a), C64e(0xe781fefe66988198), + C64e(0x5bf0a0a0ab0bf00b), C64e(0xf0447878b4cc44cc), + C64e(0x4aba2525f0d5bad5), C64e(0x96e34b4b753ee33e), + C64e(0x5ff3a2a2ac0ef30e), C64e(0xbafe5d5d4419fe19), + C64e(0x1bc08080db5bc05b), C64e(0x0a8a050580858a85), + C64e(0x7ead3f3fd3ecadec), C64e(0x42bc2121fedfbcdf), + C64e(0xe0487070a8d848d8), C64e(0xf904f1f1fd0c040c), + C64e(0xc6df6363197adf7a), C64e(0xeec177772f58c158), + C64e(0x4575afaf309f759f), C64e(0x84634242e7a563a5), + C64e(0x4030202070503050), C64e(0xd11ae5e5cb2e1a2e), + C64e(0xe10efdfdef120e12), C64e(0x656dbfbf08b76db7), + C64e(0x194c818155d44cd4), C64e(0x30141818243c143c), + C64e(0x4c352626795f355f), C64e(0x9d2fc3c3b2712f71), + C64e(0x67e1bebe8638e138), C64e(0x6aa23535c8fda2fd), + C64e(0x0bcc8888c74fcc4f), C64e(0x5c392e2e654b394b), + C64e(0x3d5793936af957f9), C64e(0xaaf25555580df20d), + C64e(0xe382fcfc619d829d), C64e(0xf4477a7ab3c947c9), + C64e(0x8bacc8c827efacef), C64e(0x6fe7baba8832e732), + C64e(0x642b32324f7d2b7d), C64e(0xd795e6e642a495a4), + C64e(0x9ba0c0c03bfba0fb), C64e(0x32981919aab398b3), + C64e(0x27d19e9ef668d168), C64e(0x5d7fa3a322817f81), + C64e(0x88664444eeaa66aa), C64e(0xa87e5454d6827e82), + C64e(0x76ab3b3bdde6abe6), C64e(0x16830b0b959e839e), + C64e(0x03ca8c8cc945ca45), C64e(0x9529c7c7bc7b297b), + C64e(0xd6d36b6b056ed36e), C64e(0x503c28286c443c44), + C64e(0x5579a7a72c8b798b), C64e(0x63e2bcbc813de23d), + C64e(0x2c1d161631271d27), C64e(0x4176adad379a769a), + C64e(0xad3bdbdb964d3b4d), C64e(0xc85664649efa56fa), + C64e(0xe84e7474a6d24ed2), C64e(0x281e141436221e22), + C64e(0x3fdb9292e476db76), 
C64e(0x180a0c0c121e0a1e), + C64e(0x906c4848fcb46cb4), C64e(0x6be4b8b88f37e437), + C64e(0x255d9f9f78e75de7), C64e(0x616ebdbd0fb26eb2), + C64e(0x86ef4343692aef2a), C64e(0x93a6c4c435f1a6f1), + C64e(0x72a83939dae3a8e3), C64e(0x62a43131c6f7a4f7), + C64e(0xbd37d3d38a593759), C64e(0xff8bf2f274868b86), + C64e(0xb132d5d583563256), C64e(0x0d438b8b4ec543c5), + C64e(0xdc596e6e85eb59eb), C64e(0xafb7dada18c2b7c2), + C64e(0x028c01018e8f8c8f), C64e(0x7964b1b11dac64ac), + C64e(0x23d29c9cf16dd26d), C64e(0x92e04949723be03b), + C64e(0xabb4d8d81fc7b4c7), C64e(0x43faacacb915fa15), + C64e(0xfd07f3f3fa090709), C64e(0x8525cfcfa06f256f), + C64e(0x8fafcaca20eaafea), C64e(0xf38ef4f47d898e89), + C64e(0x8ee947476720e920), C64e(0x2018101038281828), + C64e(0xded56f6f0b64d564), C64e(0xfb88f0f073838883), + C64e(0x946f4a4afbb16fb1), C64e(0xb8725c5cca967296), + C64e(0x70243838546c246c), C64e(0xaef157575f08f108), + C64e(0xe6c773732152c752), C64e(0x3551979764f351f3), + C64e(0x8d23cbcbae652365), C64e(0x597ca1a125847c84), + C64e(0xcb9ce8e857bf9cbf), C64e(0x7c213e3e5d632163), + C64e(0x37dd9696ea7cdd7c), C64e(0xc2dc61611e7fdc7f), + C64e(0x1a860d0d9c918691), C64e(0x1e850f0f9b948594), + C64e(0xdb90e0e04bab90ab), C64e(0xf8427c7cbac642c6), + C64e(0xe2c471712657c457), C64e(0x83aacccc29e5aae5), + C64e(0x3bd89090e373d873), C64e(0x0c050606090f050f), + C64e(0xf501f7f7f4030103), C64e(0x38121c1c2a361236), + C64e(0x9fa3c2c23cfea3fe), C64e(0xd45f6a6a8be15fe1), + C64e(0x47f9aeaebe10f910), C64e(0xd2d06969026bd06b), + C64e(0x2e911717bfa891a8), C64e(0x2958999971e858e8), + C64e(0x74273a3a53692769), C64e(0x4eb92727f7d0b9d0), + C64e(0xa938d9d991483848), C64e(0xcd13ebebde351335), + C64e(0x56b32b2be5ceb3ce), C64e(0x4433222277553355), + C64e(0xbfbbd2d204d6bbd6), C64e(0x4970a9a939907090), + C64e(0x0e89070787808980), C64e(0x66a73333c1f2a7f2), + C64e(0x5ab62d2decc1b6c1), C64e(0x78223c3c5a662266), + C64e(0x2a921515b8ad92ad), C64e(0x8920c9c9a9602060), + C64e(0x154987875cdb49db), C64e(0x4fffaaaab01aff1a), + C64e(0xa0785050d8887888), C64e(0x517aa5a52b8e7a8e), + C64e(0x068f0303898a8f8a), C64e(0xb2f859594a13f813), + C64e(0x12800909929b809b), C64e(0x34171a1a23391739), + C64e(0xcada65651075da75), C64e(0xb531d7d784533153), + C64e(0x13c68484d551c651), C64e(0xbbb8d0d003d3b8d3), + C64e(0x1fc38282dc5ec35e), C64e(0x52b02929e2cbb0cb), + C64e(0xb4775a5ac3997799), C64e(0x3c111e1e2d331133), + C64e(0xf6cb7b7b3d46cb46), C64e(0x4bfca8a8b71ffc1f), + C64e(0xdad66d6d0c61d661), C64e(0x583a2c2c624e3a4e) +}; + +#endif + +static const sph_u64 T4[] = { + C64e(0xf497a5c6c632f4a5), C64e(0x97eb84f8f86f9784), + C64e(0xb0c799eeee5eb099), C64e(0x8cf78df6f67a8c8d), + C64e(0x17e50dffffe8170d), C64e(0xdcb7bdd6d60adcbd), + C64e(0xc8a7b1dede16c8b1), C64e(0xfc395491916dfc54), + C64e(0xf0c050606090f050), C64e(0x0504030202070503), + C64e(0xe087a9cece2ee0a9), C64e(0x87ac7d5656d1877d), + C64e(0x2bd519e7e7cc2b19), C64e(0xa67162b5b513a662), + C64e(0x319ae64d4d7c31e6), C64e(0xb5c39aecec59b59a), + C64e(0xcf05458f8f40cf45), C64e(0xbc3e9d1f1fa3bc9d), + C64e(0xc00940898949c040), C64e(0x92ef87fafa689287), + C64e(0x3fc515efefd03f15), C64e(0x267febb2b29426eb), + C64e(0x4007c98e8ece40c9), C64e(0x1ded0bfbfbe61d0b), + C64e(0x2f82ec41416e2fec), C64e(0xa97d67b3b31aa967), + C64e(0x1cbefd5f5f431cfd), C64e(0x258aea45456025ea), + C64e(0xda46bf2323f9dabf), C64e(0x02a6f753535102f7), + C64e(0xa1d396e4e445a196), C64e(0xed2d5b9b9b76ed5b), + C64e(0x5deac27575285dc2), C64e(0x24d91ce1e1c5241c), + C64e(0xe97aae3d3dd4e9ae), C64e(0xbe986a4c4cf2be6a), + C64e(0xeed85a6c6c82ee5a), C64e(0xc3fc417e7ebdc341), + 
C64e(0x06f102f5f5f30602), C64e(0xd11d4f838352d14f), + C64e(0xe4d05c68688ce45c), C64e(0x07a2f451515607f4), + C64e(0x5cb934d1d18d5c34), C64e(0x18e908f9f9e11808), + C64e(0xaedf93e2e24cae93), C64e(0x954d73abab3e9573), + C64e(0xf5c453626297f553), C64e(0x41543f2a2a6b413f), + C64e(0x14100c08081c140c), C64e(0xf63152959563f652), + C64e(0xaf8c654646e9af65), C64e(0xe2215e9d9d7fe25e), + C64e(0x7860283030487828), C64e(0xf86ea13737cff8a1), + C64e(0x11140f0a0a1b110f), C64e(0xc45eb52f2febc4b5), + C64e(0x1b1c090e0e151b09), C64e(0x5a483624247e5a36), + C64e(0xb6369b1b1badb69b), C64e(0x47a53ddfdf98473d), + C64e(0x6a8126cdcda76a26), C64e(0xbb9c694e4ef5bb69), + C64e(0x4cfecd7f7f334ccd), C64e(0xbacf9feaea50ba9f), + C64e(0x2d241b12123f2d1b), C64e(0xb93a9e1d1da4b99e), + C64e(0x9cb0745858c49c74), C64e(0x72682e343446722e), + C64e(0x776c2d363641772d), C64e(0xcda3b2dcdc11cdb2), + C64e(0x2973eeb4b49d29ee), C64e(0x16b6fb5b5b4d16fb), + C64e(0x0153f6a4a4a501f6), C64e(0xd7ec4d7676a1d74d), + C64e(0xa37561b7b714a361), C64e(0x49face7d7d3449ce), + C64e(0x8da47b5252df8d7b), C64e(0x42a13edddd9f423e), + C64e(0x93bc715e5ecd9371), C64e(0xa226971313b1a297), + C64e(0x0457f5a6a6a204f5), C64e(0xb86968b9b901b868), + C64e(0x0000000000000000), C64e(0x74992cc1c1b5742c), + C64e(0xa080604040e0a060), C64e(0x21dd1fe3e3c2211f), + C64e(0x43f2c879793a43c8), C64e(0x2c77edb6b69a2ced), + C64e(0xd9b3bed4d40dd9be), C64e(0xca01468d8d47ca46), + C64e(0x70ced967671770d9), C64e(0xdde44b7272afdd4b), + C64e(0x7933de9494ed79de), C64e(0x672bd49898ff67d4), + C64e(0x237be8b0b09323e8), C64e(0xde114a85855bde4a), + C64e(0xbd6d6bbbbb06bd6b), C64e(0x7e912ac5c5bb7e2a), + C64e(0x349ee54f4f7b34e5), C64e(0x3ac116ededd73a16), + C64e(0x5417c58686d254c5), C64e(0x622fd79a9af862d7), + C64e(0xffcc55666699ff55), C64e(0xa722941111b6a794), + C64e(0x4a0fcf8a8ac04acf), C64e(0x30c910e9e9d93010), + C64e(0x0a080604040e0a06), C64e(0x98e781fefe669881), + C64e(0x0b5bf0a0a0ab0bf0), C64e(0xccf0447878b4cc44), + C64e(0xd54aba2525f0d5ba), C64e(0x3e96e34b4b753ee3), + C64e(0x0e5ff3a2a2ac0ef3), C64e(0x19bafe5d5d4419fe), + C64e(0x5b1bc08080db5bc0), C64e(0x850a8a050580858a), + C64e(0xec7ead3f3fd3ecad), C64e(0xdf42bc2121fedfbc), + C64e(0xd8e0487070a8d848), C64e(0x0cf904f1f1fd0c04), + C64e(0x7ac6df6363197adf), C64e(0x58eec177772f58c1), + C64e(0x9f4575afaf309f75), C64e(0xa584634242e7a563), + C64e(0x5040302020705030), C64e(0x2ed11ae5e5cb2e1a), + C64e(0x12e10efdfdef120e), C64e(0xb7656dbfbf08b76d), + C64e(0xd4194c818155d44c), C64e(0x3c30141818243c14), + C64e(0x5f4c352626795f35), C64e(0x719d2fc3c3b2712f), + C64e(0x3867e1bebe8638e1), C64e(0xfd6aa23535c8fda2), + C64e(0x4f0bcc8888c74fcc), C64e(0x4b5c392e2e654b39), + C64e(0xf93d5793936af957), C64e(0x0daaf25555580df2), + C64e(0x9de382fcfc619d82), C64e(0xc9f4477a7ab3c947), + C64e(0xef8bacc8c827efac), C64e(0x326fe7baba8832e7), + C64e(0x7d642b32324f7d2b), C64e(0xa4d795e6e642a495), + C64e(0xfb9ba0c0c03bfba0), C64e(0xb332981919aab398), + C64e(0x6827d19e9ef668d1), C64e(0x815d7fa3a322817f), + C64e(0xaa88664444eeaa66), C64e(0x82a87e5454d6827e), + C64e(0xe676ab3b3bdde6ab), C64e(0x9e16830b0b959e83), + C64e(0x4503ca8c8cc945ca), C64e(0x7b9529c7c7bc7b29), + C64e(0x6ed6d36b6b056ed3), C64e(0x44503c28286c443c), + C64e(0x8b5579a7a72c8b79), C64e(0x3d63e2bcbc813de2), + C64e(0x272c1d161631271d), C64e(0x9a4176adad379a76), + C64e(0x4dad3bdbdb964d3b), C64e(0xfac85664649efa56), + C64e(0xd2e84e7474a6d24e), C64e(0x22281e141436221e), + C64e(0x763fdb9292e476db), C64e(0x1e180a0c0c121e0a), + C64e(0xb4906c4848fcb46c), C64e(0x376be4b8b88f37e4), + C64e(0xe7255d9f9f78e75d), 
C64e(0xb2616ebdbd0fb26e), + C64e(0x2a86ef4343692aef), C64e(0xf193a6c4c435f1a6), + C64e(0xe372a83939dae3a8), C64e(0xf762a43131c6f7a4), + C64e(0x59bd37d3d38a5937), C64e(0x86ff8bf2f274868b), + C64e(0x56b132d5d5835632), C64e(0xc50d438b8b4ec543), + C64e(0xebdc596e6e85eb59), C64e(0xc2afb7dada18c2b7), + C64e(0x8f028c01018e8f8c), C64e(0xac7964b1b11dac64), + C64e(0x6d23d29c9cf16dd2), C64e(0x3b92e04949723be0), + C64e(0xc7abb4d8d81fc7b4), C64e(0x1543faacacb915fa), + C64e(0x09fd07f3f3fa0907), C64e(0x6f8525cfcfa06f25), + C64e(0xea8fafcaca20eaaf), C64e(0x89f38ef4f47d898e), + C64e(0x208ee947476720e9), C64e(0x2820181010382818), + C64e(0x64ded56f6f0b64d5), C64e(0x83fb88f0f0738388), + C64e(0xb1946f4a4afbb16f), C64e(0x96b8725c5cca9672), + C64e(0x6c70243838546c24), C64e(0x08aef157575f08f1), + C64e(0x52e6c773732152c7), C64e(0xf33551979764f351), + C64e(0x658d23cbcbae6523), C64e(0x84597ca1a125847c), + C64e(0xbfcb9ce8e857bf9c), C64e(0x637c213e3e5d6321), + C64e(0x7c37dd9696ea7cdd), C64e(0x7fc2dc61611e7fdc), + C64e(0x911a860d0d9c9186), C64e(0x941e850f0f9b9485), + C64e(0xabdb90e0e04bab90), C64e(0xc6f8427c7cbac642), + C64e(0x57e2c471712657c4), C64e(0xe583aacccc29e5aa), + C64e(0x733bd89090e373d8), C64e(0x0f0c050606090f05), + C64e(0x03f501f7f7f40301), C64e(0x3638121c1c2a3612), + C64e(0xfe9fa3c2c23cfea3), C64e(0xe1d45f6a6a8be15f), + C64e(0x1047f9aeaebe10f9), C64e(0x6bd2d06969026bd0), + C64e(0xa82e911717bfa891), C64e(0xe82958999971e858), + C64e(0x6974273a3a536927), C64e(0xd04eb92727f7d0b9), + C64e(0x48a938d9d9914838), C64e(0x35cd13ebebde3513), + C64e(0xce56b32b2be5ceb3), C64e(0x5544332222775533), + C64e(0xd6bfbbd2d204d6bb), C64e(0x904970a9a9399070), + C64e(0x800e890707878089), C64e(0xf266a73333c1f2a7), + C64e(0xc15ab62d2decc1b6), C64e(0x6678223c3c5a6622), + C64e(0xad2a921515b8ad92), C64e(0x608920c9c9a96020), + C64e(0xdb154987875cdb49), C64e(0x1a4fffaaaab01aff), + C64e(0x88a0785050d88878), C64e(0x8e517aa5a52b8e7a), + C64e(0x8a068f0303898a8f), C64e(0x13b2f859594a13f8), + C64e(0x9b12800909929b80), C64e(0x3934171a1a233917), + C64e(0x75cada65651075da), C64e(0x53b531d7d7845331), + C64e(0x5113c68484d551c6), C64e(0xd3bbb8d0d003d3b8), + C64e(0x5e1fc38282dc5ec3), C64e(0xcb52b02929e2cbb0), + C64e(0x99b4775a5ac39977), C64e(0x333c111e1e2d3311), + C64e(0x46f6cb7b7b3d46cb), C64e(0x1f4bfca8a8b71ffc), + C64e(0x61dad66d6d0c61d6), C64e(0x4e583a2c2c624e3a) +}; + +#if !SPH_SMALL_FOOTPRINT_GROESTL + +static const sph_u64 T5[] = { + C64e(0xa5f497a5c6c632f4), C64e(0x8497eb84f8f86f97), + C64e(0x99b0c799eeee5eb0), C64e(0x8d8cf78df6f67a8c), + C64e(0x0d17e50dffffe817), C64e(0xbddcb7bdd6d60adc), + C64e(0xb1c8a7b1dede16c8), C64e(0x54fc395491916dfc), + C64e(0x50f0c050606090f0), C64e(0x0305040302020705), + C64e(0xa9e087a9cece2ee0), C64e(0x7d87ac7d5656d187), + C64e(0x192bd519e7e7cc2b), C64e(0x62a67162b5b513a6), + C64e(0xe6319ae64d4d7c31), C64e(0x9ab5c39aecec59b5), + C64e(0x45cf05458f8f40cf), C64e(0x9dbc3e9d1f1fa3bc), + C64e(0x40c00940898949c0), C64e(0x8792ef87fafa6892), + C64e(0x153fc515efefd03f), C64e(0xeb267febb2b29426), + C64e(0xc94007c98e8ece40), C64e(0x0b1ded0bfbfbe61d), + C64e(0xec2f82ec41416e2f), C64e(0x67a97d67b3b31aa9), + C64e(0xfd1cbefd5f5f431c), C64e(0xea258aea45456025), + C64e(0xbfda46bf2323f9da), C64e(0xf702a6f753535102), + C64e(0x96a1d396e4e445a1), C64e(0x5bed2d5b9b9b76ed), + C64e(0xc25deac27575285d), C64e(0x1c24d91ce1e1c524), + C64e(0xaee97aae3d3dd4e9), C64e(0x6abe986a4c4cf2be), + C64e(0x5aeed85a6c6c82ee), C64e(0x41c3fc417e7ebdc3), + C64e(0x0206f102f5f5f306), C64e(0x4fd11d4f838352d1), + C64e(0x5ce4d05c68688ce4), C64e(0xf407a2f451515607), 
+ C64e(0x345cb934d1d18d5c), C64e(0x0818e908f9f9e118), + C64e(0x93aedf93e2e24cae), C64e(0x73954d73abab3e95), + C64e(0x53f5c453626297f5), C64e(0x3f41543f2a2a6b41), + C64e(0x0c14100c08081c14), C64e(0x52f63152959563f6), + C64e(0x65af8c654646e9af), C64e(0x5ee2215e9d9d7fe2), + C64e(0x2878602830304878), C64e(0xa1f86ea13737cff8), + C64e(0x0f11140f0a0a1b11), C64e(0xb5c45eb52f2febc4), + C64e(0x091b1c090e0e151b), C64e(0x365a483624247e5a), + C64e(0x9bb6369b1b1badb6), C64e(0x3d47a53ddfdf9847), + C64e(0x266a8126cdcda76a), C64e(0x69bb9c694e4ef5bb), + C64e(0xcd4cfecd7f7f334c), C64e(0x9fbacf9feaea50ba), + C64e(0x1b2d241b12123f2d), C64e(0x9eb93a9e1d1da4b9), + C64e(0x749cb0745858c49c), C64e(0x2e72682e34344672), + C64e(0x2d776c2d36364177), C64e(0xb2cda3b2dcdc11cd), + C64e(0xee2973eeb4b49d29), C64e(0xfb16b6fb5b5b4d16), + C64e(0xf60153f6a4a4a501), C64e(0x4dd7ec4d7676a1d7), + C64e(0x61a37561b7b714a3), C64e(0xce49face7d7d3449), + C64e(0x7b8da47b5252df8d), C64e(0x3e42a13edddd9f42), + C64e(0x7193bc715e5ecd93), C64e(0x97a226971313b1a2), + C64e(0xf50457f5a6a6a204), C64e(0x68b86968b9b901b8), + C64e(0x0000000000000000), C64e(0x2c74992cc1c1b574), + C64e(0x60a080604040e0a0), C64e(0x1f21dd1fe3e3c221), + C64e(0xc843f2c879793a43), C64e(0xed2c77edb6b69a2c), + C64e(0xbed9b3bed4d40dd9), C64e(0x46ca01468d8d47ca), + C64e(0xd970ced967671770), C64e(0x4bdde44b7272afdd), + C64e(0xde7933de9494ed79), C64e(0xd4672bd49898ff67), + C64e(0xe8237be8b0b09323), C64e(0x4ade114a85855bde), + C64e(0x6bbd6d6bbbbb06bd), C64e(0x2a7e912ac5c5bb7e), + C64e(0xe5349ee54f4f7b34), C64e(0x163ac116ededd73a), + C64e(0xc55417c58686d254), C64e(0xd7622fd79a9af862), + C64e(0x55ffcc55666699ff), C64e(0x94a722941111b6a7), + C64e(0xcf4a0fcf8a8ac04a), C64e(0x1030c910e9e9d930), + C64e(0x060a080604040e0a), C64e(0x8198e781fefe6698), + C64e(0xf00b5bf0a0a0ab0b), C64e(0x44ccf0447878b4cc), + C64e(0xbad54aba2525f0d5), C64e(0xe33e96e34b4b753e), + C64e(0xf30e5ff3a2a2ac0e), C64e(0xfe19bafe5d5d4419), + C64e(0xc05b1bc08080db5b), C64e(0x8a850a8a05058085), + C64e(0xadec7ead3f3fd3ec), C64e(0xbcdf42bc2121fedf), + C64e(0x48d8e0487070a8d8), C64e(0x040cf904f1f1fd0c), + C64e(0xdf7ac6df6363197a), C64e(0xc158eec177772f58), + C64e(0x759f4575afaf309f), C64e(0x63a584634242e7a5), + C64e(0x3050403020207050), C64e(0x1a2ed11ae5e5cb2e), + C64e(0x0e12e10efdfdef12), C64e(0x6db7656dbfbf08b7), + C64e(0x4cd4194c818155d4), C64e(0x143c30141818243c), + C64e(0x355f4c352626795f), C64e(0x2f719d2fc3c3b271), + C64e(0xe13867e1bebe8638), C64e(0xa2fd6aa23535c8fd), + C64e(0xcc4f0bcc8888c74f), C64e(0x394b5c392e2e654b), + C64e(0x57f93d5793936af9), C64e(0xf20daaf25555580d), + C64e(0x829de382fcfc619d), C64e(0x47c9f4477a7ab3c9), + C64e(0xacef8bacc8c827ef), C64e(0xe7326fe7baba8832), + C64e(0x2b7d642b32324f7d), C64e(0x95a4d795e6e642a4), + C64e(0xa0fb9ba0c0c03bfb), C64e(0x98b332981919aab3), + C64e(0xd16827d19e9ef668), C64e(0x7f815d7fa3a32281), + C64e(0x66aa88664444eeaa), C64e(0x7e82a87e5454d682), + C64e(0xabe676ab3b3bdde6), C64e(0x839e16830b0b959e), + C64e(0xca4503ca8c8cc945), C64e(0x297b9529c7c7bc7b), + C64e(0xd36ed6d36b6b056e), C64e(0x3c44503c28286c44), + C64e(0x798b5579a7a72c8b), C64e(0xe23d63e2bcbc813d), + C64e(0x1d272c1d16163127), C64e(0x769a4176adad379a), + C64e(0x3b4dad3bdbdb964d), C64e(0x56fac85664649efa), + C64e(0x4ed2e84e7474a6d2), C64e(0x1e22281e14143622), + C64e(0xdb763fdb9292e476), C64e(0x0a1e180a0c0c121e), + C64e(0x6cb4906c4848fcb4), C64e(0xe4376be4b8b88f37), + C64e(0x5de7255d9f9f78e7), C64e(0x6eb2616ebdbd0fb2), + C64e(0xef2a86ef4343692a), C64e(0xa6f193a6c4c435f1), + C64e(0xa8e372a83939dae3), 
C64e(0xa4f762a43131c6f7), + C64e(0x3759bd37d3d38a59), C64e(0x8b86ff8bf2f27486), + C64e(0x3256b132d5d58356), C64e(0x43c50d438b8b4ec5), + C64e(0x59ebdc596e6e85eb), C64e(0xb7c2afb7dada18c2), + C64e(0x8c8f028c01018e8f), C64e(0x64ac7964b1b11dac), + C64e(0xd26d23d29c9cf16d), C64e(0xe03b92e04949723b), + C64e(0xb4c7abb4d8d81fc7), C64e(0xfa1543faacacb915), + C64e(0x0709fd07f3f3fa09), C64e(0x256f8525cfcfa06f), + C64e(0xafea8fafcaca20ea), C64e(0x8e89f38ef4f47d89), + C64e(0xe9208ee947476720), C64e(0x1828201810103828), + C64e(0xd564ded56f6f0b64), C64e(0x8883fb88f0f07383), + C64e(0x6fb1946f4a4afbb1), C64e(0x7296b8725c5cca96), + C64e(0x246c70243838546c), C64e(0xf108aef157575f08), + C64e(0xc752e6c773732152), C64e(0x51f33551979764f3), + C64e(0x23658d23cbcbae65), C64e(0x7c84597ca1a12584), + C64e(0x9cbfcb9ce8e857bf), C64e(0x21637c213e3e5d63), + C64e(0xdd7c37dd9696ea7c), C64e(0xdc7fc2dc61611e7f), + C64e(0x86911a860d0d9c91), C64e(0x85941e850f0f9b94), + C64e(0x90abdb90e0e04bab), C64e(0x42c6f8427c7cbac6), + C64e(0xc457e2c471712657), C64e(0xaae583aacccc29e5), + C64e(0xd8733bd89090e373), C64e(0x050f0c050606090f), + C64e(0x0103f501f7f7f403), C64e(0x123638121c1c2a36), + C64e(0xa3fe9fa3c2c23cfe), C64e(0x5fe1d45f6a6a8be1), + C64e(0xf91047f9aeaebe10), C64e(0xd06bd2d06969026b), + C64e(0x91a82e911717bfa8), C64e(0x58e82958999971e8), + C64e(0x276974273a3a5369), C64e(0xb9d04eb92727f7d0), + C64e(0x3848a938d9d99148), C64e(0x1335cd13ebebde35), + C64e(0xb3ce56b32b2be5ce), C64e(0x3355443322227755), + C64e(0xbbd6bfbbd2d204d6), C64e(0x70904970a9a93990), + C64e(0x89800e8907078780), C64e(0xa7f266a73333c1f2), + C64e(0xb6c15ab62d2decc1), C64e(0x226678223c3c5a66), + C64e(0x92ad2a921515b8ad), C64e(0x20608920c9c9a960), + C64e(0x49db154987875cdb), C64e(0xff1a4fffaaaab01a), + C64e(0x7888a0785050d888), C64e(0x7a8e517aa5a52b8e), + C64e(0x8f8a068f0303898a), C64e(0xf813b2f859594a13), + C64e(0x809b12800909929b), C64e(0x173934171a1a2339), + C64e(0xda75cada65651075), C64e(0x3153b531d7d78453), + C64e(0xc65113c68484d551), C64e(0xb8d3bbb8d0d003d3), + C64e(0xc35e1fc38282dc5e), C64e(0xb0cb52b02929e2cb), + C64e(0x7799b4775a5ac399), C64e(0x11333c111e1e2d33), + C64e(0xcb46f6cb7b7b3d46), C64e(0xfc1f4bfca8a8b71f), + C64e(0xd661dad66d6d0c61), C64e(0x3a4e583a2c2c624e) +}; + +static const sph_u64 T6[] = { + C64e(0xf4a5f497a5c6c632), C64e(0x978497eb84f8f86f), + C64e(0xb099b0c799eeee5e), C64e(0x8c8d8cf78df6f67a), + C64e(0x170d17e50dffffe8), C64e(0xdcbddcb7bdd6d60a), + C64e(0xc8b1c8a7b1dede16), C64e(0xfc54fc395491916d), + C64e(0xf050f0c050606090), C64e(0x0503050403020207), + C64e(0xe0a9e087a9cece2e), C64e(0x877d87ac7d5656d1), + C64e(0x2b192bd519e7e7cc), C64e(0xa662a67162b5b513), + C64e(0x31e6319ae64d4d7c), C64e(0xb59ab5c39aecec59), + C64e(0xcf45cf05458f8f40), C64e(0xbc9dbc3e9d1f1fa3), + C64e(0xc040c00940898949), C64e(0x928792ef87fafa68), + C64e(0x3f153fc515efefd0), C64e(0x26eb267febb2b294), + C64e(0x40c94007c98e8ece), C64e(0x1d0b1ded0bfbfbe6), + C64e(0x2fec2f82ec41416e), C64e(0xa967a97d67b3b31a), + C64e(0x1cfd1cbefd5f5f43), C64e(0x25ea258aea454560), + C64e(0xdabfda46bf2323f9), C64e(0x02f702a6f7535351), + C64e(0xa196a1d396e4e445), C64e(0xed5bed2d5b9b9b76), + C64e(0x5dc25deac2757528), C64e(0x241c24d91ce1e1c5), + C64e(0xe9aee97aae3d3dd4), C64e(0xbe6abe986a4c4cf2), + C64e(0xee5aeed85a6c6c82), C64e(0xc341c3fc417e7ebd), + C64e(0x060206f102f5f5f3), C64e(0xd14fd11d4f838352), + C64e(0xe45ce4d05c68688c), C64e(0x07f407a2f4515156), + C64e(0x5c345cb934d1d18d), C64e(0x180818e908f9f9e1), + C64e(0xae93aedf93e2e24c), C64e(0x9573954d73abab3e), + C64e(0xf553f5c453626297), 
C64e(0x413f41543f2a2a6b), + C64e(0x140c14100c08081c), C64e(0xf652f63152959563), + C64e(0xaf65af8c654646e9), C64e(0xe25ee2215e9d9d7f), + C64e(0x7828786028303048), C64e(0xf8a1f86ea13737cf), + C64e(0x110f11140f0a0a1b), C64e(0xc4b5c45eb52f2feb), + C64e(0x1b091b1c090e0e15), C64e(0x5a365a483624247e), + C64e(0xb69bb6369b1b1bad), C64e(0x473d47a53ddfdf98), + C64e(0x6a266a8126cdcda7), C64e(0xbb69bb9c694e4ef5), + C64e(0x4ccd4cfecd7f7f33), C64e(0xba9fbacf9feaea50), + C64e(0x2d1b2d241b12123f), C64e(0xb99eb93a9e1d1da4), + C64e(0x9c749cb0745858c4), C64e(0x722e72682e343446), + C64e(0x772d776c2d363641), C64e(0xcdb2cda3b2dcdc11), + C64e(0x29ee2973eeb4b49d), C64e(0x16fb16b6fb5b5b4d), + C64e(0x01f60153f6a4a4a5), C64e(0xd74dd7ec4d7676a1), + C64e(0xa361a37561b7b714), C64e(0x49ce49face7d7d34), + C64e(0x8d7b8da47b5252df), C64e(0x423e42a13edddd9f), + C64e(0x937193bc715e5ecd), C64e(0xa297a226971313b1), + C64e(0x04f50457f5a6a6a2), C64e(0xb868b86968b9b901), + C64e(0x0000000000000000), C64e(0x742c74992cc1c1b5), + C64e(0xa060a080604040e0), C64e(0x211f21dd1fe3e3c2), + C64e(0x43c843f2c879793a), C64e(0x2ced2c77edb6b69a), + C64e(0xd9bed9b3bed4d40d), C64e(0xca46ca01468d8d47), + C64e(0x70d970ced9676717), C64e(0xdd4bdde44b7272af), + C64e(0x79de7933de9494ed), C64e(0x67d4672bd49898ff), + C64e(0x23e8237be8b0b093), C64e(0xde4ade114a85855b), + C64e(0xbd6bbd6d6bbbbb06), C64e(0x7e2a7e912ac5c5bb), + C64e(0x34e5349ee54f4f7b), C64e(0x3a163ac116ededd7), + C64e(0x54c55417c58686d2), C64e(0x62d7622fd79a9af8), + C64e(0xff55ffcc55666699), C64e(0xa794a722941111b6), + C64e(0x4acf4a0fcf8a8ac0), C64e(0x301030c910e9e9d9), + C64e(0x0a060a080604040e), C64e(0x988198e781fefe66), + C64e(0x0bf00b5bf0a0a0ab), C64e(0xcc44ccf0447878b4), + C64e(0xd5bad54aba2525f0), C64e(0x3ee33e96e34b4b75), + C64e(0x0ef30e5ff3a2a2ac), C64e(0x19fe19bafe5d5d44), + C64e(0x5bc05b1bc08080db), C64e(0x858a850a8a050580), + C64e(0xecadec7ead3f3fd3), C64e(0xdfbcdf42bc2121fe), + C64e(0xd848d8e0487070a8), C64e(0x0c040cf904f1f1fd), + C64e(0x7adf7ac6df636319), C64e(0x58c158eec177772f), + C64e(0x9f759f4575afaf30), C64e(0xa563a584634242e7), + C64e(0x5030504030202070), C64e(0x2e1a2ed11ae5e5cb), + C64e(0x120e12e10efdfdef), C64e(0xb76db7656dbfbf08), + C64e(0xd44cd4194c818155), C64e(0x3c143c3014181824), + C64e(0x5f355f4c35262679), C64e(0x712f719d2fc3c3b2), + C64e(0x38e13867e1bebe86), C64e(0xfda2fd6aa23535c8), + C64e(0x4fcc4f0bcc8888c7), C64e(0x4b394b5c392e2e65), + C64e(0xf957f93d5793936a), C64e(0x0df20daaf2555558), + C64e(0x9d829de382fcfc61), C64e(0xc947c9f4477a7ab3), + C64e(0xefacef8bacc8c827), C64e(0x32e7326fe7baba88), + C64e(0x7d2b7d642b32324f), C64e(0xa495a4d795e6e642), + C64e(0xfba0fb9ba0c0c03b), C64e(0xb398b332981919aa), + C64e(0x68d16827d19e9ef6), C64e(0x817f815d7fa3a322), + C64e(0xaa66aa88664444ee), C64e(0x827e82a87e5454d6), + C64e(0xe6abe676ab3b3bdd), C64e(0x9e839e16830b0b95), + C64e(0x45ca4503ca8c8cc9), C64e(0x7b297b9529c7c7bc), + C64e(0x6ed36ed6d36b6b05), C64e(0x443c44503c28286c), + C64e(0x8b798b5579a7a72c), C64e(0x3de23d63e2bcbc81), + C64e(0x271d272c1d161631), C64e(0x9a769a4176adad37), + C64e(0x4d3b4dad3bdbdb96), C64e(0xfa56fac85664649e), + C64e(0xd24ed2e84e7474a6), C64e(0x221e22281e141436), + C64e(0x76db763fdb9292e4), C64e(0x1e0a1e180a0c0c12), + C64e(0xb46cb4906c4848fc), C64e(0x37e4376be4b8b88f), + C64e(0xe75de7255d9f9f78), C64e(0xb26eb2616ebdbd0f), + C64e(0x2aef2a86ef434369), C64e(0xf1a6f193a6c4c435), + C64e(0xe3a8e372a83939da), C64e(0xf7a4f762a43131c6), + C64e(0x593759bd37d3d38a), C64e(0x868b86ff8bf2f274), + C64e(0x563256b132d5d583), C64e(0xc543c50d438b8b4e), + 
C64e(0xeb59ebdc596e6e85), C64e(0xc2b7c2afb7dada18), + C64e(0x8f8c8f028c01018e), C64e(0xac64ac7964b1b11d), + C64e(0x6dd26d23d29c9cf1), C64e(0x3be03b92e0494972), + C64e(0xc7b4c7abb4d8d81f), C64e(0x15fa1543faacacb9), + C64e(0x090709fd07f3f3fa), C64e(0x6f256f8525cfcfa0), + C64e(0xeaafea8fafcaca20), C64e(0x898e89f38ef4f47d), + C64e(0x20e9208ee9474767), C64e(0x2818282018101038), + C64e(0x64d564ded56f6f0b), C64e(0x838883fb88f0f073), + C64e(0xb16fb1946f4a4afb), C64e(0x967296b8725c5cca), + C64e(0x6c246c7024383854), C64e(0x08f108aef157575f), + C64e(0x52c752e6c7737321), C64e(0xf351f33551979764), + C64e(0x6523658d23cbcbae), C64e(0x847c84597ca1a125), + C64e(0xbf9cbfcb9ce8e857), C64e(0x6321637c213e3e5d), + C64e(0x7cdd7c37dd9696ea), C64e(0x7fdc7fc2dc61611e), + C64e(0x9186911a860d0d9c), C64e(0x9485941e850f0f9b), + C64e(0xab90abdb90e0e04b), C64e(0xc642c6f8427c7cba), + C64e(0x57c457e2c4717126), C64e(0xe5aae583aacccc29), + C64e(0x73d8733bd89090e3), C64e(0x0f050f0c05060609), + C64e(0x030103f501f7f7f4), C64e(0x36123638121c1c2a), + C64e(0xfea3fe9fa3c2c23c), C64e(0xe15fe1d45f6a6a8b), + C64e(0x10f91047f9aeaebe), C64e(0x6bd06bd2d0696902), + C64e(0xa891a82e911717bf), C64e(0xe858e82958999971), + C64e(0x69276974273a3a53), C64e(0xd0b9d04eb92727f7), + C64e(0x483848a938d9d991), C64e(0x351335cd13ebebde), + C64e(0xceb3ce56b32b2be5), C64e(0x5533554433222277), + C64e(0xd6bbd6bfbbd2d204), C64e(0x9070904970a9a939), + C64e(0x8089800e89070787), C64e(0xf2a7f266a73333c1), + C64e(0xc1b6c15ab62d2dec), C64e(0x66226678223c3c5a), + C64e(0xad92ad2a921515b8), C64e(0x6020608920c9c9a9), + C64e(0xdb49db154987875c), C64e(0x1aff1a4fffaaaab0), + C64e(0x887888a0785050d8), C64e(0x8e7a8e517aa5a52b), + C64e(0x8a8f8a068f030389), C64e(0x13f813b2f859594a), + C64e(0x9b809b1280090992), C64e(0x39173934171a1a23), + C64e(0x75da75cada656510), C64e(0x533153b531d7d784), + C64e(0x51c65113c68484d5), C64e(0xd3b8d3bbb8d0d003), + C64e(0x5ec35e1fc38282dc), C64e(0xcbb0cb52b02929e2), + C64e(0x997799b4775a5ac3), C64e(0x3311333c111e1e2d), + C64e(0x46cb46f6cb7b7b3d), C64e(0x1ffc1f4bfca8a8b7), + C64e(0x61d661dad66d6d0c), C64e(0x4e3a4e583a2c2c62) +}; + +static const sph_u64 T7[] = { + C64e(0x32f4a5f497a5c6c6), C64e(0x6f978497eb84f8f8), + C64e(0x5eb099b0c799eeee), C64e(0x7a8c8d8cf78df6f6), + C64e(0xe8170d17e50dffff), C64e(0x0adcbddcb7bdd6d6), + C64e(0x16c8b1c8a7b1dede), C64e(0x6dfc54fc39549191), + C64e(0x90f050f0c0506060), C64e(0x0705030504030202), + C64e(0x2ee0a9e087a9cece), C64e(0xd1877d87ac7d5656), + C64e(0xcc2b192bd519e7e7), C64e(0x13a662a67162b5b5), + C64e(0x7c31e6319ae64d4d), C64e(0x59b59ab5c39aecec), + C64e(0x40cf45cf05458f8f), C64e(0xa3bc9dbc3e9d1f1f), + C64e(0x49c040c009408989), C64e(0x68928792ef87fafa), + C64e(0xd03f153fc515efef), C64e(0x9426eb267febb2b2), + C64e(0xce40c94007c98e8e), C64e(0xe61d0b1ded0bfbfb), + C64e(0x6e2fec2f82ec4141), C64e(0x1aa967a97d67b3b3), + C64e(0x431cfd1cbefd5f5f), C64e(0x6025ea258aea4545), + C64e(0xf9dabfda46bf2323), C64e(0x5102f702a6f75353), + C64e(0x45a196a1d396e4e4), C64e(0x76ed5bed2d5b9b9b), + C64e(0x285dc25deac27575), C64e(0xc5241c24d91ce1e1), + C64e(0xd4e9aee97aae3d3d), C64e(0xf2be6abe986a4c4c), + C64e(0x82ee5aeed85a6c6c), C64e(0xbdc341c3fc417e7e), + C64e(0xf3060206f102f5f5), C64e(0x52d14fd11d4f8383), + C64e(0x8ce45ce4d05c6868), C64e(0x5607f407a2f45151), + C64e(0x8d5c345cb934d1d1), C64e(0xe1180818e908f9f9), + C64e(0x4cae93aedf93e2e2), C64e(0x3e9573954d73abab), + C64e(0x97f553f5c4536262), C64e(0x6b413f41543f2a2a), + C64e(0x1c140c14100c0808), C64e(0x63f652f631529595), + C64e(0xe9af65af8c654646), C64e(0x7fe25ee2215e9d9d), + 
C64e(0x4878287860283030), C64e(0xcff8a1f86ea13737), + C64e(0x1b110f11140f0a0a), C64e(0xebc4b5c45eb52f2f), + C64e(0x151b091b1c090e0e), C64e(0x7e5a365a48362424), + C64e(0xadb69bb6369b1b1b), C64e(0x98473d47a53ddfdf), + C64e(0xa76a266a8126cdcd), C64e(0xf5bb69bb9c694e4e), + C64e(0x334ccd4cfecd7f7f), C64e(0x50ba9fbacf9feaea), + C64e(0x3f2d1b2d241b1212), C64e(0xa4b99eb93a9e1d1d), + C64e(0xc49c749cb0745858), C64e(0x46722e72682e3434), + C64e(0x41772d776c2d3636), C64e(0x11cdb2cda3b2dcdc), + C64e(0x9d29ee2973eeb4b4), C64e(0x4d16fb16b6fb5b5b), + C64e(0xa501f60153f6a4a4), C64e(0xa1d74dd7ec4d7676), + C64e(0x14a361a37561b7b7), C64e(0x3449ce49face7d7d), + C64e(0xdf8d7b8da47b5252), C64e(0x9f423e42a13edddd), + C64e(0xcd937193bc715e5e), C64e(0xb1a297a226971313), + C64e(0xa204f50457f5a6a6), C64e(0x01b868b86968b9b9), + C64e(0x0000000000000000), C64e(0xb5742c74992cc1c1), + C64e(0xe0a060a080604040), C64e(0xc2211f21dd1fe3e3), + C64e(0x3a43c843f2c87979), C64e(0x9a2ced2c77edb6b6), + C64e(0x0dd9bed9b3bed4d4), C64e(0x47ca46ca01468d8d), + C64e(0x1770d970ced96767), C64e(0xafdd4bdde44b7272), + C64e(0xed79de7933de9494), C64e(0xff67d4672bd49898), + C64e(0x9323e8237be8b0b0), C64e(0x5bde4ade114a8585), + C64e(0x06bd6bbd6d6bbbbb), C64e(0xbb7e2a7e912ac5c5), + C64e(0x7b34e5349ee54f4f), C64e(0xd73a163ac116eded), + C64e(0xd254c55417c58686), C64e(0xf862d7622fd79a9a), + C64e(0x99ff55ffcc556666), C64e(0xb6a794a722941111), + C64e(0xc04acf4a0fcf8a8a), C64e(0xd9301030c910e9e9), + C64e(0x0e0a060a08060404), C64e(0x66988198e781fefe), + C64e(0xab0bf00b5bf0a0a0), C64e(0xb4cc44ccf0447878), + C64e(0xf0d5bad54aba2525), C64e(0x753ee33e96e34b4b), + C64e(0xac0ef30e5ff3a2a2), C64e(0x4419fe19bafe5d5d), + C64e(0xdb5bc05b1bc08080), C64e(0x80858a850a8a0505), + C64e(0xd3ecadec7ead3f3f), C64e(0xfedfbcdf42bc2121), + C64e(0xa8d848d8e0487070), C64e(0xfd0c040cf904f1f1), + C64e(0x197adf7ac6df6363), C64e(0x2f58c158eec17777), + C64e(0x309f759f4575afaf), C64e(0xe7a563a584634242), + C64e(0x7050305040302020), C64e(0xcb2e1a2ed11ae5e5), + C64e(0xef120e12e10efdfd), C64e(0x08b76db7656dbfbf), + C64e(0x55d44cd4194c8181), C64e(0x243c143c30141818), + C64e(0x795f355f4c352626), C64e(0xb2712f719d2fc3c3), + C64e(0x8638e13867e1bebe), C64e(0xc8fda2fd6aa23535), + C64e(0xc74fcc4f0bcc8888), C64e(0x654b394b5c392e2e), + C64e(0x6af957f93d579393), C64e(0x580df20daaf25555), + C64e(0x619d829de382fcfc), C64e(0xb3c947c9f4477a7a), + C64e(0x27efacef8bacc8c8), C64e(0x8832e7326fe7baba), + C64e(0x4f7d2b7d642b3232), C64e(0x42a495a4d795e6e6), + C64e(0x3bfba0fb9ba0c0c0), C64e(0xaab398b332981919), + C64e(0xf668d16827d19e9e), C64e(0x22817f815d7fa3a3), + C64e(0xeeaa66aa88664444), C64e(0xd6827e82a87e5454), + C64e(0xdde6abe676ab3b3b), C64e(0x959e839e16830b0b), + C64e(0xc945ca4503ca8c8c), C64e(0xbc7b297b9529c7c7), + C64e(0x056ed36ed6d36b6b), C64e(0x6c443c44503c2828), + C64e(0x2c8b798b5579a7a7), C64e(0x813de23d63e2bcbc), + C64e(0x31271d272c1d1616), C64e(0x379a769a4176adad), + C64e(0x964d3b4dad3bdbdb), C64e(0x9efa56fac8566464), + C64e(0xa6d24ed2e84e7474), C64e(0x36221e22281e1414), + C64e(0xe476db763fdb9292), C64e(0x121e0a1e180a0c0c), + C64e(0xfcb46cb4906c4848), C64e(0x8f37e4376be4b8b8), + C64e(0x78e75de7255d9f9f), C64e(0x0fb26eb2616ebdbd), + C64e(0x692aef2a86ef4343), C64e(0x35f1a6f193a6c4c4), + C64e(0xdae3a8e372a83939), C64e(0xc6f7a4f762a43131), + C64e(0x8a593759bd37d3d3), C64e(0x74868b86ff8bf2f2), + C64e(0x83563256b132d5d5), C64e(0x4ec543c50d438b8b), + C64e(0x85eb59ebdc596e6e), C64e(0x18c2b7c2afb7dada), + C64e(0x8e8f8c8f028c0101), C64e(0x1dac64ac7964b1b1), + C64e(0xf16dd26d23d29c9c), 
C64e(0x723be03b92e04949), + C64e(0x1fc7b4c7abb4d8d8), C64e(0xb915fa1543faacac), + C64e(0xfa090709fd07f3f3), C64e(0xa06f256f8525cfcf), + C64e(0x20eaafea8fafcaca), C64e(0x7d898e89f38ef4f4), + C64e(0x6720e9208ee94747), C64e(0x3828182820181010), + C64e(0x0b64d564ded56f6f), C64e(0x73838883fb88f0f0), + C64e(0xfbb16fb1946f4a4a), C64e(0xca967296b8725c5c), + C64e(0x546c246c70243838), C64e(0x5f08f108aef15757), + C64e(0x2152c752e6c77373), C64e(0x64f351f335519797), + C64e(0xae6523658d23cbcb), C64e(0x25847c84597ca1a1), + C64e(0x57bf9cbfcb9ce8e8), C64e(0x5d6321637c213e3e), + C64e(0xea7cdd7c37dd9696), C64e(0x1e7fdc7fc2dc6161), + C64e(0x9c9186911a860d0d), C64e(0x9b9485941e850f0f), + C64e(0x4bab90abdb90e0e0), C64e(0xbac642c6f8427c7c), + C64e(0x2657c457e2c47171), C64e(0x29e5aae583aacccc), + C64e(0xe373d8733bd89090), C64e(0x090f050f0c050606), + C64e(0xf4030103f501f7f7), C64e(0x2a36123638121c1c), + C64e(0x3cfea3fe9fa3c2c2), C64e(0x8be15fe1d45f6a6a), + C64e(0xbe10f91047f9aeae), C64e(0x026bd06bd2d06969), + C64e(0xbfa891a82e911717), C64e(0x71e858e829589999), + C64e(0x5369276974273a3a), C64e(0xf7d0b9d04eb92727), + C64e(0x91483848a938d9d9), C64e(0xde351335cd13ebeb), + C64e(0xe5ceb3ce56b32b2b), C64e(0x7755335544332222), + C64e(0x04d6bbd6bfbbd2d2), C64e(0x399070904970a9a9), + C64e(0x878089800e890707), C64e(0xc1f2a7f266a73333), + C64e(0xecc1b6c15ab62d2d), C64e(0x5a66226678223c3c), + C64e(0xb8ad92ad2a921515), C64e(0xa96020608920c9c9), + C64e(0x5cdb49db15498787), C64e(0xb01aff1a4fffaaaa), + C64e(0xd8887888a0785050), C64e(0x2b8e7a8e517aa5a5), + C64e(0x898a8f8a068f0303), C64e(0x4a13f813b2f85959), + C64e(0x929b809b12800909), C64e(0x2339173934171a1a), + C64e(0x1075da75cada6565), C64e(0x84533153b531d7d7), + C64e(0xd551c65113c68484), C64e(0x03d3b8d3bbb8d0d0), + C64e(0xdc5ec35e1fc38282), C64e(0xe2cbb0cb52b02929), + C64e(0xc3997799b4775a5a), C64e(0x2d3311333c111e1e), + C64e(0x3d46cb46f6cb7b7b), C64e(0xb71ffc1f4bfca8a8), + C64e(0x0c61d661dad66d6d), C64e(0x624e3a4e583a2c2c) +}; + +#endif + +#define DECL_STATE_SMALL \ + sph_u64 H[8]; + +#define READ_STATE_SMALL(sc) do { \ + memcpy(H, (sc)->state.wide, sizeof H); \ + } while (0) + +#define WRITE_STATE_SMALL(sc) do { \ + memcpy((sc)->state.wide, H, sizeof H); \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_GROESTL + +#define RSTT(d, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ + t[d] = T0[B64_0(a[b0])] \ + ^ R64(T0[B64_1(a[b1])], 8) \ + ^ R64(T0[B64_2(a[b2])], 16) \ + ^ R64(T0[B64_3(a[b3])], 24) \ + ^ T4[B64_4(a[b4])] \ + ^ R64(T4[B64_5(a[b5])], 8) \ + ^ R64(T4[B64_6(a[b6])], 16) \ + ^ R64(T4[B64_7(a[b7])], 24); \ + } while (0) + +#else + +#define RSTT(d, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ + t[d] = T0[B64_0(a[b0])] \ + ^ T1[B64_1(a[b1])] \ + ^ T2[B64_2(a[b2])] \ + ^ T3[B64_3(a[b3])] \ + ^ T4[B64_4(a[b4])] \ + ^ T5[B64_5(a[b5])] \ + ^ T6[B64_6(a[b6])] \ + ^ T7[B64_7(a[b7])]; \ + } while (0) + +#endif + +#define ROUND_SMALL_P(a, r) do { \ + sph_u64 t[8]; \ + a[0] ^= PC64(0x00, r); \ + a[1] ^= PC64(0x10, r); \ + a[2] ^= PC64(0x20, r); \ + a[3] ^= PC64(0x30, r); \ + a[4] ^= PC64(0x40, r); \ + a[5] ^= PC64(0x50, r); \ + a[6] ^= PC64(0x60, r); \ + a[7] ^= PC64(0x70, r); \ + RSTT(0, a, 0, 1, 2, 3, 4, 5, 6, 7); \ + RSTT(1, a, 1, 2, 3, 4, 5, 6, 7, 0); \ + RSTT(2, a, 2, 3, 4, 5, 6, 7, 0, 1); \ + RSTT(3, a, 3, 4, 5, 6, 7, 0, 1, 2); \ + RSTT(4, a, 4, 5, 6, 7, 0, 1, 2, 3); \ + RSTT(5, a, 5, 6, 7, 0, 1, 2, 3, 4); \ + RSTT(6, a, 6, 7, 0, 1, 2, 3, 4, 5); \ + RSTT(7, a, 7, 0, 1, 2, 3, 4, 5, 6); \ + a[0] = t[0]; \ + a[1] = t[1]; \ + a[2] = t[2]; \ + a[3] = t[3]; \ + a[4] = t[4]; \ + a[5] = t[5]; \ + 
a[6] = t[6]; \ + a[7] = t[7]; \ + } while (0) + +#define ROUND_SMALL_Q(a, r) do { \ + sph_u64 t[8]; \ + a[0] ^= QC64(0x00, r); \ + a[1] ^= QC64(0x10, r); \ + a[2] ^= QC64(0x20, r); \ + a[3] ^= QC64(0x30, r); \ + a[4] ^= QC64(0x40, r); \ + a[5] ^= QC64(0x50, r); \ + a[6] ^= QC64(0x60, r); \ + a[7] ^= QC64(0x70, r); \ + RSTT(0, a, 1, 3, 5, 7, 0, 2, 4, 6); \ + RSTT(1, a, 2, 4, 6, 0, 1, 3, 5, 7); \ + RSTT(2, a, 3, 5, 7, 1, 2, 4, 6, 0); \ + RSTT(3, a, 4, 6, 0, 2, 3, 5, 7, 1); \ + RSTT(4, a, 5, 7, 1, 3, 4, 6, 0, 2); \ + RSTT(5, a, 6, 0, 2, 4, 5, 7, 1, 3); \ + RSTT(6, a, 7, 1, 3, 5, 6, 0, 2, 4); \ + RSTT(7, a, 0, 2, 4, 6, 7, 1, 3, 5); \ + a[0] = t[0]; \ + a[1] = t[1]; \ + a[2] = t[2]; \ + a[3] = t[3]; \ + a[4] = t[4]; \ + a[5] = t[5]; \ + a[6] = t[6]; \ + a[7] = t[7]; \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_GROESTL + +#define PERM_SMALL_P(a) do { \ + int r; \ + for (r = 0; r < 10; r ++) \ + ROUND_SMALL_P(a, r); \ + } while (0) + +#define PERM_SMALL_Q(a) do { \ + int r; \ + for (r = 0; r < 10; r ++) \ + ROUND_SMALL_Q(a, r); \ + } while (0) + +#else + +/* + * Apparently, unrolling more than that confuses GCC, resulting in + * lower performance, even though L1 cache would be no problem. + */ +#define PERM_SMALL_P(a) do { \ + int r; \ + for (r = 0; r < 10; r += 2) { \ + ROUND_SMALL_P(a, r + 0); \ + ROUND_SMALL_P(a, r + 1); \ + } \ + } while (0) + +#define PERM_SMALL_Q(a) do { \ + int r; \ + for (r = 0; r < 10; r += 2) { \ + ROUND_SMALL_Q(a, r + 0); \ + ROUND_SMALL_Q(a, r + 1); \ + } \ + } while (0) + +#endif + +#define COMPRESS_SMALL do { \ + sph_u64 g[8], m[8]; \ + size_t u; \ + for (u = 0; u < 8; u ++) { \ + m[u] = dec64e_aligned(buf + (u << 3)); \ + g[u] = m[u] ^ H[u]; \ + } \ + PERM_SMALL_P(g); \ + PERM_SMALL_Q(m); \ + for (u = 0; u < 8; u ++) \ + H[u] ^= g[u] ^ m[u]; \ + } while (0) + +#define FINAL_SMALL do { \ + sph_u64 x[8]; \ + size_t u; \ + memcpy(x, H, sizeof x); \ + PERM_SMALL_P(x); \ + for (u = 0; u < 8; u ++) \ + H[u] ^= x[u]; \ + } while (0) + +#define DECL_STATE_BIG \ + sph_u64 H[16]; + +#define READ_STATE_BIG(sc) do { \ + memcpy(H, (sc)->state.wide, sizeof H); \ + } while (0) + +#define WRITE_STATE_BIG(sc) do { \ + memcpy((sc)->state.wide, H, sizeof H); \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_GROESTL + +#define RBTT(d, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ + t[d] = T0[B64_0(a[b0])] \ + ^ R64(T0[B64_1(a[b1])], 8) \ + ^ R64(T0[B64_2(a[b2])], 16) \ + ^ R64(T0[B64_3(a[b3])], 24) \ + ^ T4[B64_4(a[b4])] \ + ^ R64(T4[B64_5(a[b5])], 8) \ + ^ R64(T4[B64_6(a[b6])], 16) \ + ^ R64(T4[B64_7(a[b7])], 24); \ + } while (0) + +#else + +#define RBTT(d, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ + t[d] = T0[B64_0(a[b0])] \ + ^ T1[B64_1(a[b1])] \ + ^ T2[B64_2(a[b2])] \ + ^ T3[B64_3(a[b3])] \ + ^ T4[B64_4(a[b4])] \ + ^ T5[B64_5(a[b5])] \ + ^ T6[B64_6(a[b6])] \ + ^ T7[B64_7(a[b7])]; \ + } while (0) + +#endif + +#if SPH_SMALL_FOOTPRINT_GROESTL + +#define ROUND_BIG_P(a, r) do { \ + sph_u64 t[16]; \ + size_t u; \ + a[0x0] ^= PC64(0x00, r); \ + a[0x1] ^= PC64(0x10, r); \ + a[0x2] ^= PC64(0x20, r); \ + a[0x3] ^= PC64(0x30, r); \ + a[0x4] ^= PC64(0x40, r); \ + a[0x5] ^= PC64(0x50, r); \ + a[0x6] ^= PC64(0x60, r); \ + a[0x7] ^= PC64(0x70, r); \ + a[0x8] ^= PC64(0x80, r); \ + a[0x9] ^= PC64(0x90, r); \ + a[0xA] ^= PC64(0xA0, r); \ + a[0xB] ^= PC64(0xB0, r); \ + a[0xC] ^= PC64(0xC0, r); \ + a[0xD] ^= PC64(0xD0, r); \ + a[0xE] ^= PC64(0xE0, r); \ + a[0xF] ^= PC64(0xF0, r); \ + for (u = 0; u < 16; u += 4) { \ + RBTT(u + 0, a, u + 0, (u + 1) & 0xF, \ + (u + 2) & 0xF, (u + 3) & 0xF, (u + 4) & 0xF, \ + (u + 5) 
& 0xF, (u + 6) & 0xF, (u + 11) & 0xF); \ + RBTT(u + 1, a, u + 1, (u + 2) & 0xF, \ + (u + 3) & 0xF, (u + 4) & 0xF, (u + 5) & 0xF, \ + (u + 6) & 0xF, (u + 7) & 0xF, (u + 12) & 0xF); \ + RBTT(u + 2, a, u + 2, (u + 3) & 0xF, \ + (u + 4) & 0xF, (u + 5) & 0xF, (u + 6) & 0xF, \ + (u + 7) & 0xF, (u + 8) & 0xF, (u + 13) & 0xF); \ + RBTT(u + 3, a, u + 3, (u + 4) & 0xF, \ + (u + 5) & 0xF, (u + 6) & 0xF, (u + 7) & 0xF, \ + (u + 8) & 0xF, (u + 9) & 0xF, (u + 14) & 0xF); \ + } \ + memcpy(a, t, sizeof t); \ + } while (0) + +#define ROUND_BIG_Q(a, r) do { \ + sph_u64 t[16]; \ + size_t u; \ + a[0x0] ^= QC64(0x00, r); \ + a[0x1] ^= QC64(0x10, r); \ + a[0x2] ^= QC64(0x20, r); \ + a[0x3] ^= QC64(0x30, r); \ + a[0x4] ^= QC64(0x40, r); \ + a[0x5] ^= QC64(0x50, r); \ + a[0x6] ^= QC64(0x60, r); \ + a[0x7] ^= QC64(0x70, r); \ + a[0x8] ^= QC64(0x80, r); \ + a[0x9] ^= QC64(0x90, r); \ + a[0xA] ^= QC64(0xA0, r); \ + a[0xB] ^= QC64(0xB0, r); \ + a[0xC] ^= QC64(0xC0, r); \ + a[0xD] ^= QC64(0xD0, r); \ + a[0xE] ^= QC64(0xE0, r); \ + a[0xF] ^= QC64(0xF0, r); \ + for (u = 0; u < 16; u += 4) { \ + RBTT(u + 0, a, (u + 1) & 0xF, (u + 3) & 0xF, \ + (u + 5) & 0xF, (u + 11) & 0xF, (u + 0) & 0xF, \ + (u + 2) & 0xF, (u + 4) & 0xF, (u + 6) & 0xF); \ + RBTT(u + 1, a, (u + 2) & 0xF, (u + 4) & 0xF, \ + (u + 6) & 0xF, (u + 12) & 0xF, (u + 1) & 0xF, \ + (u + 3) & 0xF, (u + 5) & 0xF, (u + 7) & 0xF); \ + RBTT(u + 2, a, (u + 3) & 0xF, (u + 5) & 0xF, \ + (u + 7) & 0xF, (u + 13) & 0xF, (u + 2) & 0xF, \ + (u + 4) & 0xF, (u + 6) & 0xF, (u + 8) & 0xF); \ + RBTT(u + 3, a, (u + 4) & 0xF, (u + 6) & 0xF, \ + (u + 8) & 0xF, (u + 14) & 0xF, (u + 3) & 0xF, \ + (u + 5) & 0xF, (u + 7) & 0xF, (u + 9) & 0xF); \ + } \ + memcpy(a, t, sizeof t); \ + } while (0) + +#else + +#define ROUND_BIG_P(a, r) do { \ + sph_u64 t[16]; \ + a[0x0] ^= PC64(0x00, r); \ + a[0x1] ^= PC64(0x10, r); \ + a[0x2] ^= PC64(0x20, r); \ + a[0x3] ^= PC64(0x30, r); \ + a[0x4] ^= PC64(0x40, r); \ + a[0x5] ^= PC64(0x50, r); \ + a[0x6] ^= PC64(0x60, r); \ + a[0x7] ^= PC64(0x70, r); \ + a[0x8] ^= PC64(0x80, r); \ + a[0x9] ^= PC64(0x90, r); \ + a[0xA] ^= PC64(0xA0, r); \ + a[0xB] ^= PC64(0xB0, r); \ + a[0xC] ^= PC64(0xC0, r); \ + a[0xD] ^= PC64(0xD0, r); \ + a[0xE] ^= PC64(0xE0, r); \ + a[0xF] ^= PC64(0xF0, r); \ + RBTT(0x0, a, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0xB); \ + RBTT(0x1, a, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0xC); \ + RBTT(0x2, a, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0xD); \ + RBTT(0x3, a, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xE); \ + RBTT(0x4, a, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xF); \ + RBTT(0x5, a, 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xB, 0x0); \ + RBTT(0x6, a, 0x6, 0x7, 0x8, 0x9, 0xA, 0xB, 0xC, 0x1); \ + RBTT(0x7, a, 0x7, 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0x2); \ + RBTT(0x8, a, 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0x3); \ + RBTT(0x9, a, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x4); \ + RBTT(0xA, a, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x0, 0x5); \ + RBTT(0xB, a, 0xB, 0xC, 0xD, 0xE, 0xF, 0x0, 0x1, 0x6); \ + RBTT(0xC, a, 0xC, 0xD, 0xE, 0xF, 0x0, 0x1, 0x2, 0x7); \ + RBTT(0xD, a, 0xD, 0xE, 0xF, 0x0, 0x1, 0x2, 0x3, 0x8); \ + RBTT(0xE, a, 0xE, 0xF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x9); \ + RBTT(0xF, a, 0xF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0xA); \ + a[0x0] = t[0x0]; \ + a[0x1] = t[0x1]; \ + a[0x2] = t[0x2]; \ + a[0x3] = t[0x3]; \ + a[0x4] = t[0x4]; \ + a[0x5] = t[0x5]; \ + a[0x6] = t[0x6]; \ + a[0x7] = t[0x7]; \ + a[0x8] = t[0x8]; \ + a[0x9] = t[0x9]; \ + a[0xA] = t[0xA]; \ + a[0xB] = t[0xB]; \ + a[0xC] = t[0xC]; \ + a[0xD] = t[0xD]; \ + a[0xE] = t[0xE]; \ + a[0xF] = t[0xF]; \ + } while (0) + +#define ROUND_BIG_Q(a, 
r) do { \ + sph_u64 t[16]; \ + a[0x0] ^= QC64(0x00, r); \ + a[0x1] ^= QC64(0x10, r); \ + a[0x2] ^= QC64(0x20, r); \ + a[0x3] ^= QC64(0x30, r); \ + a[0x4] ^= QC64(0x40, r); \ + a[0x5] ^= QC64(0x50, r); \ + a[0x6] ^= QC64(0x60, r); \ + a[0x7] ^= QC64(0x70, r); \ + a[0x8] ^= QC64(0x80, r); \ + a[0x9] ^= QC64(0x90, r); \ + a[0xA] ^= QC64(0xA0, r); \ + a[0xB] ^= QC64(0xB0, r); \ + a[0xC] ^= QC64(0xC0, r); \ + a[0xD] ^= QC64(0xD0, r); \ + a[0xE] ^= QC64(0xE0, r); \ + a[0xF] ^= QC64(0xF0, r); \ + RBTT(0x0, a, 0x1, 0x3, 0x5, 0xB, 0x0, 0x2, 0x4, 0x6); \ + RBTT(0x1, a, 0x2, 0x4, 0x6, 0xC, 0x1, 0x3, 0x5, 0x7); \ + RBTT(0x2, a, 0x3, 0x5, 0x7, 0xD, 0x2, 0x4, 0x6, 0x8); \ + RBTT(0x3, a, 0x4, 0x6, 0x8, 0xE, 0x3, 0x5, 0x7, 0x9); \ + RBTT(0x4, a, 0x5, 0x7, 0x9, 0xF, 0x4, 0x6, 0x8, 0xA); \ + RBTT(0x5, a, 0x6, 0x8, 0xA, 0x0, 0x5, 0x7, 0x9, 0xB); \ + RBTT(0x6, a, 0x7, 0x9, 0xB, 0x1, 0x6, 0x8, 0xA, 0xC); \ + RBTT(0x7, a, 0x8, 0xA, 0xC, 0x2, 0x7, 0x9, 0xB, 0xD); \ + RBTT(0x8, a, 0x9, 0xB, 0xD, 0x3, 0x8, 0xA, 0xC, 0xE); \ + RBTT(0x9, a, 0xA, 0xC, 0xE, 0x4, 0x9, 0xB, 0xD, 0xF); \ + RBTT(0xA, a, 0xB, 0xD, 0xF, 0x5, 0xA, 0xC, 0xE, 0x0); \ + RBTT(0xB, a, 0xC, 0xE, 0x0, 0x6, 0xB, 0xD, 0xF, 0x1); \ + RBTT(0xC, a, 0xD, 0xF, 0x1, 0x7, 0xC, 0xE, 0x0, 0x2); \ + RBTT(0xD, a, 0xE, 0x0, 0x2, 0x8, 0xD, 0xF, 0x1, 0x3); \ + RBTT(0xE, a, 0xF, 0x1, 0x3, 0x9, 0xE, 0x0, 0x2, 0x4); \ + RBTT(0xF, a, 0x0, 0x2, 0x4, 0xA, 0xF, 0x1, 0x3, 0x5); \ + a[0x0] = t[0x0]; \ + a[0x1] = t[0x1]; \ + a[0x2] = t[0x2]; \ + a[0x3] = t[0x3]; \ + a[0x4] = t[0x4]; \ + a[0x5] = t[0x5]; \ + a[0x6] = t[0x6]; \ + a[0x7] = t[0x7]; \ + a[0x8] = t[0x8]; \ + a[0x9] = t[0x9]; \ + a[0xA] = t[0xA]; \ + a[0xB] = t[0xB]; \ + a[0xC] = t[0xC]; \ + a[0xD] = t[0xD]; \ + a[0xE] = t[0xE]; \ + a[0xF] = t[0xF]; \ + } while (0) + +#endif + +#define PERM_BIG_P(a) do { \ + int r; \ + for (r = 0; r < 14; r += 2) { \ + ROUND_BIG_P(a, r + 0); \ + ROUND_BIG_P(a, r + 1); \ + } \ + } while (0) + +#define PERM_BIG_Q(a) do { \ + int r; \ + for (r = 0; r < 14; r += 2) { \ + ROUND_BIG_Q(a, r + 0); \ + ROUND_BIG_Q(a, r + 1); \ + } \ + } while (0) + +/* obsolete +#if SPH_SMALL_FOOTPRINT_GROESTL + +#define COMPRESS_BIG do { \ + sph_u64 g[16], m[16], *ya; \ + const sph_u64 *yc; \ + size_t u; \ + int i; \ + for (u = 0; u < 16; u ++) { \ + m[u] = dec64e_aligned(buf + (u << 3)); \ + g[u] = m[u] ^ H[u]; \ + } \ + ya = g; \ + yc = CP; \ + for (i = 0; i < 2; i ++) { \ + PERM_BIG(ya, yc); \ + ya = m; \ + yc = CQ; \ + } \ + for (u = 0; u < 16; u ++) { \ + H[u] ^= g[u] ^ m[u]; \ + } \ + } while (0) + +#else +*/ + +#define COMPRESS_BIG do { \ + sph_u64 g[16], m[16]; \ + size_t u; \ + for (u = 0; u < 16; u ++) { \ + m[u] = dec64e_aligned(buf + (u << 3)); \ + g[u] = m[u] ^ H[u]; \ + } \ + PERM_BIG_P(g); \ + PERM_BIG_Q(m); \ + for (u = 0; u < 16; u ++) { \ + H[u] ^= g[u] ^ m[u]; \ + } \ + } while (0) + +/* obsolete +#endif +*/ + +#define FINAL_BIG do { \ + sph_u64 x[16]; \ + size_t u; \ + memcpy(x, H, sizeof x); \ + PERM_BIG_P(x); \ + for (u = 0; u < 16; u ++) \ + H[u] ^= x[u]; \ + } while (0) + +#else + +static const sph_u32 T0up[] = { + C32e(0xc632f4a5), C32e(0xf86f9784), C32e(0xee5eb099), C32e(0xf67a8c8d), + C32e(0xffe8170d), C32e(0xd60adcbd), C32e(0xde16c8b1), C32e(0x916dfc54), + C32e(0x6090f050), C32e(0x02070503), C32e(0xce2ee0a9), C32e(0x56d1877d), + C32e(0xe7cc2b19), C32e(0xb513a662), C32e(0x4d7c31e6), C32e(0xec59b59a), + C32e(0x8f40cf45), C32e(0x1fa3bc9d), C32e(0x8949c040), C32e(0xfa689287), + C32e(0xefd03f15), C32e(0xb29426eb), C32e(0x8ece40c9), C32e(0xfbe61d0b), + C32e(0x416e2fec), 
C32e(0xb31aa967), C32e(0x5f431cfd), C32e(0x456025ea), + C32e(0x23f9dabf), C32e(0x535102f7), C32e(0xe445a196), C32e(0x9b76ed5b), + C32e(0x75285dc2), C32e(0xe1c5241c), C32e(0x3dd4e9ae), C32e(0x4cf2be6a), + C32e(0x6c82ee5a), C32e(0x7ebdc341), C32e(0xf5f30602), C32e(0x8352d14f), + C32e(0x688ce45c), C32e(0x515607f4), C32e(0xd18d5c34), C32e(0xf9e11808), + C32e(0xe24cae93), C32e(0xab3e9573), C32e(0x6297f553), C32e(0x2a6b413f), + C32e(0x081c140c), C32e(0x9563f652), C32e(0x46e9af65), C32e(0x9d7fe25e), + C32e(0x30487828), C32e(0x37cff8a1), C32e(0x0a1b110f), C32e(0x2febc4b5), + C32e(0x0e151b09), C32e(0x247e5a36), C32e(0x1badb69b), C32e(0xdf98473d), + C32e(0xcda76a26), C32e(0x4ef5bb69), C32e(0x7f334ccd), C32e(0xea50ba9f), + C32e(0x123f2d1b), C32e(0x1da4b99e), C32e(0x58c49c74), C32e(0x3446722e), + C32e(0x3641772d), C32e(0xdc11cdb2), C32e(0xb49d29ee), C32e(0x5b4d16fb), + C32e(0xa4a501f6), C32e(0x76a1d74d), C32e(0xb714a361), C32e(0x7d3449ce), + C32e(0x52df8d7b), C32e(0xdd9f423e), C32e(0x5ecd9371), C32e(0x13b1a297), + C32e(0xa6a204f5), C32e(0xb901b868), C32e(0x00000000), C32e(0xc1b5742c), + C32e(0x40e0a060), C32e(0xe3c2211f), C32e(0x793a43c8), C32e(0xb69a2ced), + C32e(0xd40dd9be), C32e(0x8d47ca46), C32e(0x671770d9), C32e(0x72afdd4b), + C32e(0x94ed79de), C32e(0x98ff67d4), C32e(0xb09323e8), C32e(0x855bde4a), + C32e(0xbb06bd6b), C32e(0xc5bb7e2a), C32e(0x4f7b34e5), C32e(0xedd73a16), + C32e(0x86d254c5), C32e(0x9af862d7), C32e(0x6699ff55), C32e(0x11b6a794), + C32e(0x8ac04acf), C32e(0xe9d93010), C32e(0x040e0a06), C32e(0xfe669881), + C32e(0xa0ab0bf0), C32e(0x78b4cc44), C32e(0x25f0d5ba), C32e(0x4b753ee3), + C32e(0xa2ac0ef3), C32e(0x5d4419fe), C32e(0x80db5bc0), C32e(0x0580858a), + C32e(0x3fd3ecad), C32e(0x21fedfbc), C32e(0x70a8d848), C32e(0xf1fd0c04), + C32e(0x63197adf), C32e(0x772f58c1), C32e(0xaf309f75), C32e(0x42e7a563), + C32e(0x20705030), C32e(0xe5cb2e1a), C32e(0xfdef120e), C32e(0xbf08b76d), + C32e(0x8155d44c), C32e(0x18243c14), C32e(0x26795f35), C32e(0xc3b2712f), + C32e(0xbe8638e1), C32e(0x35c8fda2), C32e(0x88c74fcc), C32e(0x2e654b39), + C32e(0x936af957), C32e(0x55580df2), C32e(0xfc619d82), C32e(0x7ab3c947), + C32e(0xc827efac), C32e(0xba8832e7), C32e(0x324f7d2b), C32e(0xe642a495), + C32e(0xc03bfba0), C32e(0x19aab398), C32e(0x9ef668d1), C32e(0xa322817f), + C32e(0x44eeaa66), C32e(0x54d6827e), C32e(0x3bdde6ab), C32e(0x0b959e83), + C32e(0x8cc945ca), C32e(0xc7bc7b29), C32e(0x6b056ed3), C32e(0x286c443c), + C32e(0xa72c8b79), C32e(0xbc813de2), C32e(0x1631271d), C32e(0xad379a76), + C32e(0xdb964d3b), C32e(0x649efa56), C32e(0x74a6d24e), C32e(0x1436221e), + C32e(0x92e476db), C32e(0x0c121e0a), C32e(0x48fcb46c), C32e(0xb88f37e4), + C32e(0x9f78e75d), C32e(0xbd0fb26e), C32e(0x43692aef), C32e(0xc435f1a6), + C32e(0x39dae3a8), C32e(0x31c6f7a4), C32e(0xd38a5937), C32e(0xf274868b), + C32e(0xd5835632), C32e(0x8b4ec543), C32e(0x6e85eb59), C32e(0xda18c2b7), + C32e(0x018e8f8c), C32e(0xb11dac64), C32e(0x9cf16dd2), C32e(0x49723be0), + C32e(0xd81fc7b4), C32e(0xacb915fa), C32e(0xf3fa0907), C32e(0xcfa06f25), + C32e(0xca20eaaf), C32e(0xf47d898e), C32e(0x476720e9), C32e(0x10382818), + C32e(0x6f0b64d5), C32e(0xf0738388), C32e(0x4afbb16f), C32e(0x5cca9672), + C32e(0x38546c24), C32e(0x575f08f1), C32e(0x732152c7), C32e(0x9764f351), + C32e(0xcbae6523), C32e(0xa125847c), C32e(0xe857bf9c), C32e(0x3e5d6321), + C32e(0x96ea7cdd), C32e(0x611e7fdc), C32e(0x0d9c9186), C32e(0x0f9b9485), + C32e(0xe04bab90), C32e(0x7cbac642), C32e(0x712657c4), C32e(0xcc29e5aa), + C32e(0x90e373d8), C32e(0x06090f05), C32e(0xf7f40301), C32e(0x1c2a3612), + C32e(0xc23cfea3), 
C32e(0x6a8be15f), C32e(0xaebe10f9), C32e(0x69026bd0), + C32e(0x17bfa891), C32e(0x9971e858), C32e(0x3a536927), C32e(0x27f7d0b9), + C32e(0xd9914838), C32e(0xebde3513), C32e(0x2be5ceb3), C32e(0x22775533), + C32e(0xd204d6bb), C32e(0xa9399070), C32e(0x07878089), C32e(0x33c1f2a7), + C32e(0x2decc1b6), C32e(0x3c5a6622), C32e(0x15b8ad92), C32e(0xc9a96020), + C32e(0x875cdb49), C32e(0xaab01aff), C32e(0x50d88878), C32e(0xa52b8e7a), + C32e(0x03898a8f), C32e(0x594a13f8), C32e(0x09929b80), C32e(0x1a233917), + C32e(0x651075da), C32e(0xd7845331), C32e(0x84d551c6), C32e(0xd003d3b8), + C32e(0x82dc5ec3), C32e(0x29e2cbb0), C32e(0x5ac39977), C32e(0x1e2d3311), + C32e(0x7b3d46cb), C32e(0xa8b71ffc), C32e(0x6d0c61d6), C32e(0x2c624e3a) +}; + +static const sph_u32 T0dn[] = { + C32e(0xf497a5c6), C32e(0x97eb84f8), C32e(0xb0c799ee), C32e(0x8cf78df6), + C32e(0x17e50dff), C32e(0xdcb7bdd6), C32e(0xc8a7b1de), C32e(0xfc395491), + C32e(0xf0c05060), C32e(0x05040302), C32e(0xe087a9ce), C32e(0x87ac7d56), + C32e(0x2bd519e7), C32e(0xa67162b5), C32e(0x319ae64d), C32e(0xb5c39aec), + C32e(0xcf05458f), C32e(0xbc3e9d1f), C32e(0xc0094089), C32e(0x92ef87fa), + C32e(0x3fc515ef), C32e(0x267febb2), C32e(0x4007c98e), C32e(0x1ded0bfb), + C32e(0x2f82ec41), C32e(0xa97d67b3), C32e(0x1cbefd5f), C32e(0x258aea45), + C32e(0xda46bf23), C32e(0x02a6f753), C32e(0xa1d396e4), C32e(0xed2d5b9b), + C32e(0x5deac275), C32e(0x24d91ce1), C32e(0xe97aae3d), C32e(0xbe986a4c), + C32e(0xeed85a6c), C32e(0xc3fc417e), C32e(0x06f102f5), C32e(0xd11d4f83), + C32e(0xe4d05c68), C32e(0x07a2f451), C32e(0x5cb934d1), C32e(0x18e908f9), + C32e(0xaedf93e2), C32e(0x954d73ab), C32e(0xf5c45362), C32e(0x41543f2a), + C32e(0x14100c08), C32e(0xf6315295), C32e(0xaf8c6546), C32e(0xe2215e9d), + C32e(0x78602830), C32e(0xf86ea137), C32e(0x11140f0a), C32e(0xc45eb52f), + C32e(0x1b1c090e), C32e(0x5a483624), C32e(0xb6369b1b), C32e(0x47a53ddf), + C32e(0x6a8126cd), C32e(0xbb9c694e), C32e(0x4cfecd7f), C32e(0xbacf9fea), + C32e(0x2d241b12), C32e(0xb93a9e1d), C32e(0x9cb07458), C32e(0x72682e34), + C32e(0x776c2d36), C32e(0xcda3b2dc), C32e(0x2973eeb4), C32e(0x16b6fb5b), + C32e(0x0153f6a4), C32e(0xd7ec4d76), C32e(0xa37561b7), C32e(0x49face7d), + C32e(0x8da47b52), C32e(0x42a13edd), C32e(0x93bc715e), C32e(0xa2269713), + C32e(0x0457f5a6), C32e(0xb86968b9), C32e(0x00000000), C32e(0x74992cc1), + C32e(0xa0806040), C32e(0x21dd1fe3), C32e(0x43f2c879), C32e(0x2c77edb6), + C32e(0xd9b3bed4), C32e(0xca01468d), C32e(0x70ced967), C32e(0xdde44b72), + C32e(0x7933de94), C32e(0x672bd498), C32e(0x237be8b0), C32e(0xde114a85), + C32e(0xbd6d6bbb), C32e(0x7e912ac5), C32e(0x349ee54f), C32e(0x3ac116ed), + C32e(0x5417c586), C32e(0x622fd79a), C32e(0xffcc5566), C32e(0xa7229411), + C32e(0x4a0fcf8a), C32e(0x30c910e9), C32e(0x0a080604), C32e(0x98e781fe), + C32e(0x0b5bf0a0), C32e(0xccf04478), C32e(0xd54aba25), C32e(0x3e96e34b), + C32e(0x0e5ff3a2), C32e(0x19bafe5d), C32e(0x5b1bc080), C32e(0x850a8a05), + C32e(0xec7ead3f), C32e(0xdf42bc21), C32e(0xd8e04870), C32e(0x0cf904f1), + C32e(0x7ac6df63), C32e(0x58eec177), C32e(0x9f4575af), C32e(0xa5846342), + C32e(0x50403020), C32e(0x2ed11ae5), C32e(0x12e10efd), C32e(0xb7656dbf), + C32e(0xd4194c81), C32e(0x3c301418), C32e(0x5f4c3526), C32e(0x719d2fc3), + C32e(0x3867e1be), C32e(0xfd6aa235), C32e(0x4f0bcc88), C32e(0x4b5c392e), + C32e(0xf93d5793), C32e(0x0daaf255), C32e(0x9de382fc), C32e(0xc9f4477a), + C32e(0xef8bacc8), C32e(0x326fe7ba), C32e(0x7d642b32), C32e(0xa4d795e6), + C32e(0xfb9ba0c0), C32e(0xb3329819), C32e(0x6827d19e), C32e(0x815d7fa3), + C32e(0xaa886644), C32e(0x82a87e54), C32e(0xe676ab3b), 
C32e(0x9e16830b), + C32e(0x4503ca8c), C32e(0x7b9529c7), C32e(0x6ed6d36b), C32e(0x44503c28), + C32e(0x8b5579a7), C32e(0x3d63e2bc), C32e(0x272c1d16), C32e(0x9a4176ad), + C32e(0x4dad3bdb), C32e(0xfac85664), C32e(0xd2e84e74), C32e(0x22281e14), + C32e(0x763fdb92), C32e(0x1e180a0c), C32e(0xb4906c48), C32e(0x376be4b8), + C32e(0xe7255d9f), C32e(0xb2616ebd), C32e(0x2a86ef43), C32e(0xf193a6c4), + C32e(0xe372a839), C32e(0xf762a431), C32e(0x59bd37d3), C32e(0x86ff8bf2), + C32e(0x56b132d5), C32e(0xc50d438b), C32e(0xebdc596e), C32e(0xc2afb7da), + C32e(0x8f028c01), C32e(0xac7964b1), C32e(0x6d23d29c), C32e(0x3b92e049), + C32e(0xc7abb4d8), C32e(0x1543faac), C32e(0x09fd07f3), C32e(0x6f8525cf), + C32e(0xea8fafca), C32e(0x89f38ef4), C32e(0x208ee947), C32e(0x28201810), + C32e(0x64ded56f), C32e(0x83fb88f0), C32e(0xb1946f4a), C32e(0x96b8725c), + C32e(0x6c702438), C32e(0x08aef157), C32e(0x52e6c773), C32e(0xf3355197), + C32e(0x658d23cb), C32e(0x84597ca1), C32e(0xbfcb9ce8), C32e(0x637c213e), + C32e(0x7c37dd96), C32e(0x7fc2dc61), C32e(0x911a860d), C32e(0x941e850f), + C32e(0xabdb90e0), C32e(0xc6f8427c), C32e(0x57e2c471), C32e(0xe583aacc), + C32e(0x733bd890), C32e(0x0f0c0506), C32e(0x03f501f7), C32e(0x3638121c), + C32e(0xfe9fa3c2), C32e(0xe1d45f6a), C32e(0x1047f9ae), C32e(0x6bd2d069), + C32e(0xa82e9117), C32e(0xe8295899), C32e(0x6974273a), C32e(0xd04eb927), + C32e(0x48a938d9), C32e(0x35cd13eb), C32e(0xce56b32b), C32e(0x55443322), + C32e(0xd6bfbbd2), C32e(0x904970a9), C32e(0x800e8907), C32e(0xf266a733), + C32e(0xc15ab62d), C32e(0x6678223c), C32e(0xad2a9215), C32e(0x608920c9), + C32e(0xdb154987), C32e(0x1a4fffaa), C32e(0x88a07850), C32e(0x8e517aa5), + C32e(0x8a068f03), C32e(0x13b2f859), C32e(0x9b128009), C32e(0x3934171a), + C32e(0x75cada65), C32e(0x53b531d7), C32e(0x5113c684), C32e(0xd3bbb8d0), + C32e(0x5e1fc382), C32e(0xcb52b029), C32e(0x99b4775a), C32e(0x333c111e), + C32e(0x46f6cb7b), C32e(0x1f4bfca8), C32e(0x61dad66d), C32e(0x4e583a2c) +}; + +static const sph_u32 T1up[] = { + C32e(0xc6c632f4), C32e(0xf8f86f97), C32e(0xeeee5eb0), C32e(0xf6f67a8c), + C32e(0xffffe817), C32e(0xd6d60adc), C32e(0xdede16c8), C32e(0x91916dfc), + C32e(0x606090f0), C32e(0x02020705), C32e(0xcece2ee0), C32e(0x5656d187), + C32e(0xe7e7cc2b), C32e(0xb5b513a6), C32e(0x4d4d7c31), C32e(0xecec59b5), + C32e(0x8f8f40cf), C32e(0x1f1fa3bc), C32e(0x898949c0), C32e(0xfafa6892), + C32e(0xefefd03f), C32e(0xb2b29426), C32e(0x8e8ece40), C32e(0xfbfbe61d), + C32e(0x41416e2f), C32e(0xb3b31aa9), C32e(0x5f5f431c), C32e(0x45456025), + C32e(0x2323f9da), C32e(0x53535102), C32e(0xe4e445a1), C32e(0x9b9b76ed), + C32e(0x7575285d), C32e(0xe1e1c524), C32e(0x3d3dd4e9), C32e(0x4c4cf2be), + C32e(0x6c6c82ee), C32e(0x7e7ebdc3), C32e(0xf5f5f306), C32e(0x838352d1), + C32e(0x68688ce4), C32e(0x51515607), C32e(0xd1d18d5c), C32e(0xf9f9e118), + C32e(0xe2e24cae), C32e(0xabab3e95), C32e(0x626297f5), C32e(0x2a2a6b41), + C32e(0x08081c14), C32e(0x959563f6), C32e(0x4646e9af), C32e(0x9d9d7fe2), + C32e(0x30304878), C32e(0x3737cff8), C32e(0x0a0a1b11), C32e(0x2f2febc4), + C32e(0x0e0e151b), C32e(0x24247e5a), C32e(0x1b1badb6), C32e(0xdfdf9847), + C32e(0xcdcda76a), C32e(0x4e4ef5bb), C32e(0x7f7f334c), C32e(0xeaea50ba), + C32e(0x12123f2d), C32e(0x1d1da4b9), C32e(0x5858c49c), C32e(0x34344672), + C32e(0x36364177), C32e(0xdcdc11cd), C32e(0xb4b49d29), C32e(0x5b5b4d16), + C32e(0xa4a4a501), C32e(0x7676a1d7), C32e(0xb7b714a3), C32e(0x7d7d3449), + C32e(0x5252df8d), C32e(0xdddd9f42), C32e(0x5e5ecd93), C32e(0x1313b1a2), + C32e(0xa6a6a204), C32e(0xb9b901b8), C32e(0x00000000), C32e(0xc1c1b574), + C32e(0x4040e0a0), 
C32e(0xe3e3c221), C32e(0x79793a43), C32e(0xb6b69a2c), + C32e(0xd4d40dd9), C32e(0x8d8d47ca), C32e(0x67671770), C32e(0x7272afdd), + C32e(0x9494ed79), C32e(0x9898ff67), C32e(0xb0b09323), C32e(0x85855bde), + C32e(0xbbbb06bd), C32e(0xc5c5bb7e), C32e(0x4f4f7b34), C32e(0xededd73a), + C32e(0x8686d254), C32e(0x9a9af862), C32e(0x666699ff), C32e(0x1111b6a7), + C32e(0x8a8ac04a), C32e(0xe9e9d930), C32e(0x04040e0a), C32e(0xfefe6698), + C32e(0xa0a0ab0b), C32e(0x7878b4cc), C32e(0x2525f0d5), C32e(0x4b4b753e), + C32e(0xa2a2ac0e), C32e(0x5d5d4419), C32e(0x8080db5b), C32e(0x05058085), + C32e(0x3f3fd3ec), C32e(0x2121fedf), C32e(0x7070a8d8), C32e(0xf1f1fd0c), + C32e(0x6363197a), C32e(0x77772f58), C32e(0xafaf309f), C32e(0x4242e7a5), + C32e(0x20207050), C32e(0xe5e5cb2e), C32e(0xfdfdef12), C32e(0xbfbf08b7), + C32e(0x818155d4), C32e(0x1818243c), C32e(0x2626795f), C32e(0xc3c3b271), + C32e(0xbebe8638), C32e(0x3535c8fd), C32e(0x8888c74f), C32e(0x2e2e654b), + C32e(0x93936af9), C32e(0x5555580d), C32e(0xfcfc619d), C32e(0x7a7ab3c9), + C32e(0xc8c827ef), C32e(0xbaba8832), C32e(0x32324f7d), C32e(0xe6e642a4), + C32e(0xc0c03bfb), C32e(0x1919aab3), C32e(0x9e9ef668), C32e(0xa3a32281), + C32e(0x4444eeaa), C32e(0x5454d682), C32e(0x3b3bdde6), C32e(0x0b0b959e), + C32e(0x8c8cc945), C32e(0xc7c7bc7b), C32e(0x6b6b056e), C32e(0x28286c44), + C32e(0xa7a72c8b), C32e(0xbcbc813d), C32e(0x16163127), C32e(0xadad379a), + C32e(0xdbdb964d), C32e(0x64649efa), C32e(0x7474a6d2), C32e(0x14143622), + C32e(0x9292e476), C32e(0x0c0c121e), C32e(0x4848fcb4), C32e(0xb8b88f37), + C32e(0x9f9f78e7), C32e(0xbdbd0fb2), C32e(0x4343692a), C32e(0xc4c435f1), + C32e(0x3939dae3), C32e(0x3131c6f7), C32e(0xd3d38a59), C32e(0xf2f27486), + C32e(0xd5d58356), C32e(0x8b8b4ec5), C32e(0x6e6e85eb), C32e(0xdada18c2), + C32e(0x01018e8f), C32e(0xb1b11dac), C32e(0x9c9cf16d), C32e(0x4949723b), + C32e(0xd8d81fc7), C32e(0xacacb915), C32e(0xf3f3fa09), C32e(0xcfcfa06f), + C32e(0xcaca20ea), C32e(0xf4f47d89), C32e(0x47476720), C32e(0x10103828), + C32e(0x6f6f0b64), C32e(0xf0f07383), C32e(0x4a4afbb1), C32e(0x5c5cca96), + C32e(0x3838546c), C32e(0x57575f08), C32e(0x73732152), C32e(0x979764f3), + C32e(0xcbcbae65), C32e(0xa1a12584), C32e(0xe8e857bf), C32e(0x3e3e5d63), + C32e(0x9696ea7c), C32e(0x61611e7f), C32e(0x0d0d9c91), C32e(0x0f0f9b94), + C32e(0xe0e04bab), C32e(0x7c7cbac6), C32e(0x71712657), C32e(0xcccc29e5), + C32e(0x9090e373), C32e(0x0606090f), C32e(0xf7f7f403), C32e(0x1c1c2a36), + C32e(0xc2c23cfe), C32e(0x6a6a8be1), C32e(0xaeaebe10), C32e(0x6969026b), + C32e(0x1717bfa8), C32e(0x999971e8), C32e(0x3a3a5369), C32e(0x2727f7d0), + C32e(0xd9d99148), C32e(0xebebde35), C32e(0x2b2be5ce), C32e(0x22227755), + C32e(0xd2d204d6), C32e(0xa9a93990), C32e(0x07078780), C32e(0x3333c1f2), + C32e(0x2d2decc1), C32e(0x3c3c5a66), C32e(0x1515b8ad), C32e(0xc9c9a960), + C32e(0x87875cdb), C32e(0xaaaab01a), C32e(0x5050d888), C32e(0xa5a52b8e), + C32e(0x0303898a), C32e(0x59594a13), C32e(0x0909929b), C32e(0x1a1a2339), + C32e(0x65651075), C32e(0xd7d78453), C32e(0x8484d551), C32e(0xd0d003d3), + C32e(0x8282dc5e), C32e(0x2929e2cb), C32e(0x5a5ac399), C32e(0x1e1e2d33), + C32e(0x7b7b3d46), C32e(0xa8a8b71f), C32e(0x6d6d0c61), C32e(0x2c2c624e) +}; + +static const sph_u32 T1dn[] = { + C32e(0xa5f497a5), C32e(0x8497eb84), C32e(0x99b0c799), C32e(0x8d8cf78d), + C32e(0x0d17e50d), C32e(0xbddcb7bd), C32e(0xb1c8a7b1), C32e(0x54fc3954), + C32e(0x50f0c050), C32e(0x03050403), C32e(0xa9e087a9), C32e(0x7d87ac7d), + C32e(0x192bd519), C32e(0x62a67162), C32e(0xe6319ae6), C32e(0x9ab5c39a), + C32e(0x45cf0545), C32e(0x9dbc3e9d), C32e(0x40c00940), 
C32e(0x8792ef87), + C32e(0x153fc515), C32e(0xeb267feb), C32e(0xc94007c9), C32e(0x0b1ded0b), + C32e(0xec2f82ec), C32e(0x67a97d67), C32e(0xfd1cbefd), C32e(0xea258aea), + C32e(0xbfda46bf), C32e(0xf702a6f7), C32e(0x96a1d396), C32e(0x5bed2d5b), + C32e(0xc25deac2), C32e(0x1c24d91c), C32e(0xaee97aae), C32e(0x6abe986a), + C32e(0x5aeed85a), C32e(0x41c3fc41), C32e(0x0206f102), C32e(0x4fd11d4f), + C32e(0x5ce4d05c), C32e(0xf407a2f4), C32e(0x345cb934), C32e(0x0818e908), + C32e(0x93aedf93), C32e(0x73954d73), C32e(0x53f5c453), C32e(0x3f41543f), + C32e(0x0c14100c), C32e(0x52f63152), C32e(0x65af8c65), C32e(0x5ee2215e), + C32e(0x28786028), C32e(0xa1f86ea1), C32e(0x0f11140f), C32e(0xb5c45eb5), + C32e(0x091b1c09), C32e(0x365a4836), C32e(0x9bb6369b), C32e(0x3d47a53d), + C32e(0x266a8126), C32e(0x69bb9c69), C32e(0xcd4cfecd), C32e(0x9fbacf9f), + C32e(0x1b2d241b), C32e(0x9eb93a9e), C32e(0x749cb074), C32e(0x2e72682e), + C32e(0x2d776c2d), C32e(0xb2cda3b2), C32e(0xee2973ee), C32e(0xfb16b6fb), + C32e(0xf60153f6), C32e(0x4dd7ec4d), C32e(0x61a37561), C32e(0xce49face), + C32e(0x7b8da47b), C32e(0x3e42a13e), C32e(0x7193bc71), C32e(0x97a22697), + C32e(0xf50457f5), C32e(0x68b86968), C32e(0x00000000), C32e(0x2c74992c), + C32e(0x60a08060), C32e(0x1f21dd1f), C32e(0xc843f2c8), C32e(0xed2c77ed), + C32e(0xbed9b3be), C32e(0x46ca0146), C32e(0xd970ced9), C32e(0x4bdde44b), + C32e(0xde7933de), C32e(0xd4672bd4), C32e(0xe8237be8), C32e(0x4ade114a), + C32e(0x6bbd6d6b), C32e(0x2a7e912a), C32e(0xe5349ee5), C32e(0x163ac116), + C32e(0xc55417c5), C32e(0xd7622fd7), C32e(0x55ffcc55), C32e(0x94a72294), + C32e(0xcf4a0fcf), C32e(0x1030c910), C32e(0x060a0806), C32e(0x8198e781), + C32e(0xf00b5bf0), C32e(0x44ccf044), C32e(0xbad54aba), C32e(0xe33e96e3), + C32e(0xf30e5ff3), C32e(0xfe19bafe), C32e(0xc05b1bc0), C32e(0x8a850a8a), + C32e(0xadec7ead), C32e(0xbcdf42bc), C32e(0x48d8e048), C32e(0x040cf904), + C32e(0xdf7ac6df), C32e(0xc158eec1), C32e(0x759f4575), C32e(0x63a58463), + C32e(0x30504030), C32e(0x1a2ed11a), C32e(0x0e12e10e), C32e(0x6db7656d), + C32e(0x4cd4194c), C32e(0x143c3014), C32e(0x355f4c35), C32e(0x2f719d2f), + C32e(0xe13867e1), C32e(0xa2fd6aa2), C32e(0xcc4f0bcc), C32e(0x394b5c39), + C32e(0x57f93d57), C32e(0xf20daaf2), C32e(0x829de382), C32e(0x47c9f447), + C32e(0xacef8bac), C32e(0xe7326fe7), C32e(0x2b7d642b), C32e(0x95a4d795), + C32e(0xa0fb9ba0), C32e(0x98b33298), C32e(0xd16827d1), C32e(0x7f815d7f), + C32e(0x66aa8866), C32e(0x7e82a87e), C32e(0xabe676ab), C32e(0x839e1683), + C32e(0xca4503ca), C32e(0x297b9529), C32e(0xd36ed6d3), C32e(0x3c44503c), + C32e(0x798b5579), C32e(0xe23d63e2), C32e(0x1d272c1d), C32e(0x769a4176), + C32e(0x3b4dad3b), C32e(0x56fac856), C32e(0x4ed2e84e), C32e(0x1e22281e), + C32e(0xdb763fdb), C32e(0x0a1e180a), C32e(0x6cb4906c), C32e(0xe4376be4), + C32e(0x5de7255d), C32e(0x6eb2616e), C32e(0xef2a86ef), C32e(0xa6f193a6), + C32e(0xa8e372a8), C32e(0xa4f762a4), C32e(0x3759bd37), C32e(0x8b86ff8b), + C32e(0x3256b132), C32e(0x43c50d43), C32e(0x59ebdc59), C32e(0xb7c2afb7), + C32e(0x8c8f028c), C32e(0x64ac7964), C32e(0xd26d23d2), C32e(0xe03b92e0), + C32e(0xb4c7abb4), C32e(0xfa1543fa), C32e(0x0709fd07), C32e(0x256f8525), + C32e(0xafea8faf), C32e(0x8e89f38e), C32e(0xe9208ee9), C32e(0x18282018), + C32e(0xd564ded5), C32e(0x8883fb88), C32e(0x6fb1946f), C32e(0x7296b872), + C32e(0x246c7024), C32e(0xf108aef1), C32e(0xc752e6c7), C32e(0x51f33551), + C32e(0x23658d23), C32e(0x7c84597c), C32e(0x9cbfcb9c), C32e(0x21637c21), + C32e(0xdd7c37dd), C32e(0xdc7fc2dc), C32e(0x86911a86), C32e(0x85941e85), + C32e(0x90abdb90), C32e(0x42c6f842), C32e(0xc457e2c4), 
C32e(0xaae583aa), + C32e(0xd8733bd8), C32e(0x050f0c05), C32e(0x0103f501), C32e(0x12363812), + C32e(0xa3fe9fa3), C32e(0x5fe1d45f), C32e(0xf91047f9), C32e(0xd06bd2d0), + C32e(0x91a82e91), C32e(0x58e82958), C32e(0x27697427), C32e(0xb9d04eb9), + C32e(0x3848a938), C32e(0x1335cd13), C32e(0xb3ce56b3), C32e(0x33554433), + C32e(0xbbd6bfbb), C32e(0x70904970), C32e(0x89800e89), C32e(0xa7f266a7), + C32e(0xb6c15ab6), C32e(0x22667822), C32e(0x92ad2a92), C32e(0x20608920), + C32e(0x49db1549), C32e(0xff1a4fff), C32e(0x7888a078), C32e(0x7a8e517a), + C32e(0x8f8a068f), C32e(0xf813b2f8), C32e(0x809b1280), C32e(0x17393417), + C32e(0xda75cada), C32e(0x3153b531), C32e(0xc65113c6), C32e(0xb8d3bbb8), + C32e(0xc35e1fc3), C32e(0xb0cb52b0), C32e(0x7799b477), C32e(0x11333c11), + C32e(0xcb46f6cb), C32e(0xfc1f4bfc), C32e(0xd661dad6), C32e(0x3a4e583a) +}; + +static const sph_u32 T2up[] = { + C32e(0xa5c6c632), C32e(0x84f8f86f), C32e(0x99eeee5e), C32e(0x8df6f67a), + C32e(0x0dffffe8), C32e(0xbdd6d60a), C32e(0xb1dede16), C32e(0x5491916d), + C32e(0x50606090), C32e(0x03020207), C32e(0xa9cece2e), C32e(0x7d5656d1), + C32e(0x19e7e7cc), C32e(0x62b5b513), C32e(0xe64d4d7c), C32e(0x9aecec59), + C32e(0x458f8f40), C32e(0x9d1f1fa3), C32e(0x40898949), C32e(0x87fafa68), + C32e(0x15efefd0), C32e(0xebb2b294), C32e(0xc98e8ece), C32e(0x0bfbfbe6), + C32e(0xec41416e), C32e(0x67b3b31a), C32e(0xfd5f5f43), C32e(0xea454560), + C32e(0xbf2323f9), C32e(0xf7535351), C32e(0x96e4e445), C32e(0x5b9b9b76), + C32e(0xc2757528), C32e(0x1ce1e1c5), C32e(0xae3d3dd4), C32e(0x6a4c4cf2), + C32e(0x5a6c6c82), C32e(0x417e7ebd), C32e(0x02f5f5f3), C32e(0x4f838352), + C32e(0x5c68688c), C32e(0xf4515156), C32e(0x34d1d18d), C32e(0x08f9f9e1), + C32e(0x93e2e24c), C32e(0x73abab3e), C32e(0x53626297), C32e(0x3f2a2a6b), + C32e(0x0c08081c), C32e(0x52959563), C32e(0x654646e9), C32e(0x5e9d9d7f), + C32e(0x28303048), C32e(0xa13737cf), C32e(0x0f0a0a1b), C32e(0xb52f2feb), + C32e(0x090e0e15), C32e(0x3624247e), C32e(0x9b1b1bad), C32e(0x3ddfdf98), + C32e(0x26cdcda7), C32e(0x694e4ef5), C32e(0xcd7f7f33), C32e(0x9feaea50), + C32e(0x1b12123f), C32e(0x9e1d1da4), C32e(0x745858c4), C32e(0x2e343446), + C32e(0x2d363641), C32e(0xb2dcdc11), C32e(0xeeb4b49d), C32e(0xfb5b5b4d), + C32e(0xf6a4a4a5), C32e(0x4d7676a1), C32e(0x61b7b714), C32e(0xce7d7d34), + C32e(0x7b5252df), C32e(0x3edddd9f), C32e(0x715e5ecd), C32e(0x971313b1), + C32e(0xf5a6a6a2), C32e(0x68b9b901), C32e(0x00000000), C32e(0x2cc1c1b5), + C32e(0x604040e0), C32e(0x1fe3e3c2), C32e(0xc879793a), C32e(0xedb6b69a), + C32e(0xbed4d40d), C32e(0x468d8d47), C32e(0xd9676717), C32e(0x4b7272af), + C32e(0xde9494ed), C32e(0xd49898ff), C32e(0xe8b0b093), C32e(0x4a85855b), + C32e(0x6bbbbb06), C32e(0x2ac5c5bb), C32e(0xe54f4f7b), C32e(0x16ededd7), + C32e(0xc58686d2), C32e(0xd79a9af8), C32e(0x55666699), C32e(0x941111b6), + C32e(0xcf8a8ac0), C32e(0x10e9e9d9), C32e(0x0604040e), C32e(0x81fefe66), + C32e(0xf0a0a0ab), C32e(0x447878b4), C32e(0xba2525f0), C32e(0xe34b4b75), + C32e(0xf3a2a2ac), C32e(0xfe5d5d44), C32e(0xc08080db), C32e(0x8a050580), + C32e(0xad3f3fd3), C32e(0xbc2121fe), C32e(0x487070a8), C32e(0x04f1f1fd), + C32e(0xdf636319), C32e(0xc177772f), C32e(0x75afaf30), C32e(0x634242e7), + C32e(0x30202070), C32e(0x1ae5e5cb), C32e(0x0efdfdef), C32e(0x6dbfbf08), + C32e(0x4c818155), C32e(0x14181824), C32e(0x35262679), C32e(0x2fc3c3b2), + C32e(0xe1bebe86), C32e(0xa23535c8), C32e(0xcc8888c7), C32e(0x392e2e65), + C32e(0x5793936a), C32e(0xf2555558), C32e(0x82fcfc61), C32e(0x477a7ab3), + C32e(0xacc8c827), C32e(0xe7baba88), C32e(0x2b32324f), C32e(0x95e6e642), + C32e(0xa0c0c03b), 
C32e(0x981919aa), C32e(0xd19e9ef6), C32e(0x7fa3a322), + C32e(0x664444ee), C32e(0x7e5454d6), C32e(0xab3b3bdd), C32e(0x830b0b95), + C32e(0xca8c8cc9), C32e(0x29c7c7bc), C32e(0xd36b6b05), C32e(0x3c28286c), + C32e(0x79a7a72c), C32e(0xe2bcbc81), C32e(0x1d161631), C32e(0x76adad37), + C32e(0x3bdbdb96), C32e(0x5664649e), C32e(0x4e7474a6), C32e(0x1e141436), + C32e(0xdb9292e4), C32e(0x0a0c0c12), C32e(0x6c4848fc), C32e(0xe4b8b88f), + C32e(0x5d9f9f78), C32e(0x6ebdbd0f), C32e(0xef434369), C32e(0xa6c4c435), + C32e(0xa83939da), C32e(0xa43131c6), C32e(0x37d3d38a), C32e(0x8bf2f274), + C32e(0x32d5d583), C32e(0x438b8b4e), C32e(0x596e6e85), C32e(0xb7dada18), + C32e(0x8c01018e), C32e(0x64b1b11d), C32e(0xd29c9cf1), C32e(0xe0494972), + C32e(0xb4d8d81f), C32e(0xfaacacb9), C32e(0x07f3f3fa), C32e(0x25cfcfa0), + C32e(0xafcaca20), C32e(0x8ef4f47d), C32e(0xe9474767), C32e(0x18101038), + C32e(0xd56f6f0b), C32e(0x88f0f073), C32e(0x6f4a4afb), C32e(0x725c5cca), + C32e(0x24383854), C32e(0xf157575f), C32e(0xc7737321), C32e(0x51979764), + C32e(0x23cbcbae), C32e(0x7ca1a125), C32e(0x9ce8e857), C32e(0x213e3e5d), + C32e(0xdd9696ea), C32e(0xdc61611e), C32e(0x860d0d9c), C32e(0x850f0f9b), + C32e(0x90e0e04b), C32e(0x427c7cba), C32e(0xc4717126), C32e(0xaacccc29), + C32e(0xd89090e3), C32e(0x05060609), C32e(0x01f7f7f4), C32e(0x121c1c2a), + C32e(0xa3c2c23c), C32e(0x5f6a6a8b), C32e(0xf9aeaebe), C32e(0xd0696902), + C32e(0x911717bf), C32e(0x58999971), C32e(0x273a3a53), C32e(0xb92727f7), + C32e(0x38d9d991), C32e(0x13ebebde), C32e(0xb32b2be5), C32e(0x33222277), + C32e(0xbbd2d204), C32e(0x70a9a939), C32e(0x89070787), C32e(0xa73333c1), + C32e(0xb62d2dec), C32e(0x223c3c5a), C32e(0x921515b8), C32e(0x20c9c9a9), + C32e(0x4987875c), C32e(0xffaaaab0), C32e(0x785050d8), C32e(0x7aa5a52b), + C32e(0x8f030389), C32e(0xf859594a), C32e(0x80090992), C32e(0x171a1a23), + C32e(0xda656510), C32e(0x31d7d784), C32e(0xc68484d5), C32e(0xb8d0d003), + C32e(0xc38282dc), C32e(0xb02929e2), C32e(0x775a5ac3), C32e(0x111e1e2d), + C32e(0xcb7b7b3d), C32e(0xfca8a8b7), C32e(0xd66d6d0c), C32e(0x3a2c2c62) +}; + +static const sph_u32 T2dn[] = { + C32e(0xf4a5f497), C32e(0x978497eb), C32e(0xb099b0c7), C32e(0x8c8d8cf7), + C32e(0x170d17e5), C32e(0xdcbddcb7), C32e(0xc8b1c8a7), C32e(0xfc54fc39), + C32e(0xf050f0c0), C32e(0x05030504), C32e(0xe0a9e087), C32e(0x877d87ac), + C32e(0x2b192bd5), C32e(0xa662a671), C32e(0x31e6319a), C32e(0xb59ab5c3), + C32e(0xcf45cf05), C32e(0xbc9dbc3e), C32e(0xc040c009), C32e(0x928792ef), + C32e(0x3f153fc5), C32e(0x26eb267f), C32e(0x40c94007), C32e(0x1d0b1ded), + C32e(0x2fec2f82), C32e(0xa967a97d), C32e(0x1cfd1cbe), C32e(0x25ea258a), + C32e(0xdabfda46), C32e(0x02f702a6), C32e(0xa196a1d3), C32e(0xed5bed2d), + C32e(0x5dc25dea), C32e(0x241c24d9), C32e(0xe9aee97a), C32e(0xbe6abe98), + C32e(0xee5aeed8), C32e(0xc341c3fc), C32e(0x060206f1), C32e(0xd14fd11d), + C32e(0xe45ce4d0), C32e(0x07f407a2), C32e(0x5c345cb9), C32e(0x180818e9), + C32e(0xae93aedf), C32e(0x9573954d), C32e(0xf553f5c4), C32e(0x413f4154), + C32e(0x140c1410), C32e(0xf652f631), C32e(0xaf65af8c), C32e(0xe25ee221), + C32e(0x78287860), C32e(0xf8a1f86e), C32e(0x110f1114), C32e(0xc4b5c45e), + C32e(0x1b091b1c), C32e(0x5a365a48), C32e(0xb69bb636), C32e(0x473d47a5), + C32e(0x6a266a81), C32e(0xbb69bb9c), C32e(0x4ccd4cfe), C32e(0xba9fbacf), + C32e(0x2d1b2d24), C32e(0xb99eb93a), C32e(0x9c749cb0), C32e(0x722e7268), + C32e(0x772d776c), C32e(0xcdb2cda3), C32e(0x29ee2973), C32e(0x16fb16b6), + C32e(0x01f60153), C32e(0xd74dd7ec), C32e(0xa361a375), C32e(0x49ce49fa), + C32e(0x8d7b8da4), C32e(0x423e42a1), C32e(0x937193bc), 
C32e(0xa297a226), + C32e(0x04f50457), C32e(0xb868b869), C32e(0x00000000), C32e(0x742c7499), + C32e(0xa060a080), C32e(0x211f21dd), C32e(0x43c843f2), C32e(0x2ced2c77), + C32e(0xd9bed9b3), C32e(0xca46ca01), C32e(0x70d970ce), C32e(0xdd4bdde4), + C32e(0x79de7933), C32e(0x67d4672b), C32e(0x23e8237b), C32e(0xde4ade11), + C32e(0xbd6bbd6d), C32e(0x7e2a7e91), C32e(0x34e5349e), C32e(0x3a163ac1), + C32e(0x54c55417), C32e(0x62d7622f), C32e(0xff55ffcc), C32e(0xa794a722), + C32e(0x4acf4a0f), C32e(0x301030c9), C32e(0x0a060a08), C32e(0x988198e7), + C32e(0x0bf00b5b), C32e(0xcc44ccf0), C32e(0xd5bad54a), C32e(0x3ee33e96), + C32e(0x0ef30e5f), C32e(0x19fe19ba), C32e(0x5bc05b1b), C32e(0x858a850a), + C32e(0xecadec7e), C32e(0xdfbcdf42), C32e(0xd848d8e0), C32e(0x0c040cf9), + C32e(0x7adf7ac6), C32e(0x58c158ee), C32e(0x9f759f45), C32e(0xa563a584), + C32e(0x50305040), C32e(0x2e1a2ed1), C32e(0x120e12e1), C32e(0xb76db765), + C32e(0xd44cd419), C32e(0x3c143c30), C32e(0x5f355f4c), C32e(0x712f719d), + C32e(0x38e13867), C32e(0xfda2fd6a), C32e(0x4fcc4f0b), C32e(0x4b394b5c), + C32e(0xf957f93d), C32e(0x0df20daa), C32e(0x9d829de3), C32e(0xc947c9f4), + C32e(0xefacef8b), C32e(0x32e7326f), C32e(0x7d2b7d64), C32e(0xa495a4d7), + C32e(0xfba0fb9b), C32e(0xb398b332), C32e(0x68d16827), C32e(0x817f815d), + C32e(0xaa66aa88), C32e(0x827e82a8), C32e(0xe6abe676), C32e(0x9e839e16), + C32e(0x45ca4503), C32e(0x7b297b95), C32e(0x6ed36ed6), C32e(0x443c4450), + C32e(0x8b798b55), C32e(0x3de23d63), C32e(0x271d272c), C32e(0x9a769a41), + C32e(0x4d3b4dad), C32e(0xfa56fac8), C32e(0xd24ed2e8), C32e(0x221e2228), + C32e(0x76db763f), C32e(0x1e0a1e18), C32e(0xb46cb490), C32e(0x37e4376b), + C32e(0xe75de725), C32e(0xb26eb261), C32e(0x2aef2a86), C32e(0xf1a6f193), + C32e(0xe3a8e372), C32e(0xf7a4f762), C32e(0x593759bd), C32e(0x868b86ff), + C32e(0x563256b1), C32e(0xc543c50d), C32e(0xeb59ebdc), C32e(0xc2b7c2af), + C32e(0x8f8c8f02), C32e(0xac64ac79), C32e(0x6dd26d23), C32e(0x3be03b92), + C32e(0xc7b4c7ab), C32e(0x15fa1543), C32e(0x090709fd), C32e(0x6f256f85), + C32e(0xeaafea8f), C32e(0x898e89f3), C32e(0x20e9208e), C32e(0x28182820), + C32e(0x64d564de), C32e(0x838883fb), C32e(0xb16fb194), C32e(0x967296b8), + C32e(0x6c246c70), C32e(0x08f108ae), C32e(0x52c752e6), C32e(0xf351f335), + C32e(0x6523658d), C32e(0x847c8459), C32e(0xbf9cbfcb), C32e(0x6321637c), + C32e(0x7cdd7c37), C32e(0x7fdc7fc2), C32e(0x9186911a), C32e(0x9485941e), + C32e(0xab90abdb), C32e(0xc642c6f8), C32e(0x57c457e2), C32e(0xe5aae583), + C32e(0x73d8733b), C32e(0x0f050f0c), C32e(0x030103f5), C32e(0x36123638), + C32e(0xfea3fe9f), C32e(0xe15fe1d4), C32e(0x10f91047), C32e(0x6bd06bd2), + C32e(0xa891a82e), C32e(0xe858e829), C32e(0x69276974), C32e(0xd0b9d04e), + C32e(0x483848a9), C32e(0x351335cd), C32e(0xceb3ce56), C32e(0x55335544), + C32e(0xd6bbd6bf), C32e(0x90709049), C32e(0x8089800e), C32e(0xf2a7f266), + C32e(0xc1b6c15a), C32e(0x66226678), C32e(0xad92ad2a), C32e(0x60206089), + C32e(0xdb49db15), C32e(0x1aff1a4f), C32e(0x887888a0), C32e(0x8e7a8e51), + C32e(0x8a8f8a06), C32e(0x13f813b2), C32e(0x9b809b12), C32e(0x39173934), + C32e(0x75da75ca), C32e(0x533153b5), C32e(0x51c65113), C32e(0xd3b8d3bb), + C32e(0x5ec35e1f), C32e(0xcbb0cb52), C32e(0x997799b4), C32e(0x3311333c), + C32e(0x46cb46f6), C32e(0x1ffc1f4b), C32e(0x61d661da), C32e(0x4e3a4e58) +}; + +static const sph_u32 T3up[] = { + C32e(0x97a5c6c6), C32e(0xeb84f8f8), C32e(0xc799eeee), C32e(0xf78df6f6), + C32e(0xe50dffff), C32e(0xb7bdd6d6), C32e(0xa7b1dede), C32e(0x39549191), + C32e(0xc0506060), C32e(0x04030202), C32e(0x87a9cece), C32e(0xac7d5656), + C32e(0xd519e7e7), 
C32e(0x7162b5b5), C32e(0x9ae64d4d), C32e(0xc39aecec), + C32e(0x05458f8f), C32e(0x3e9d1f1f), C32e(0x09408989), C32e(0xef87fafa), + C32e(0xc515efef), C32e(0x7febb2b2), C32e(0x07c98e8e), C32e(0xed0bfbfb), + C32e(0x82ec4141), C32e(0x7d67b3b3), C32e(0xbefd5f5f), C32e(0x8aea4545), + C32e(0x46bf2323), C32e(0xa6f75353), C32e(0xd396e4e4), C32e(0x2d5b9b9b), + C32e(0xeac27575), C32e(0xd91ce1e1), C32e(0x7aae3d3d), C32e(0x986a4c4c), + C32e(0xd85a6c6c), C32e(0xfc417e7e), C32e(0xf102f5f5), C32e(0x1d4f8383), + C32e(0xd05c6868), C32e(0xa2f45151), C32e(0xb934d1d1), C32e(0xe908f9f9), + C32e(0xdf93e2e2), C32e(0x4d73abab), C32e(0xc4536262), C32e(0x543f2a2a), + C32e(0x100c0808), C32e(0x31529595), C32e(0x8c654646), C32e(0x215e9d9d), + C32e(0x60283030), C32e(0x6ea13737), C32e(0x140f0a0a), C32e(0x5eb52f2f), + C32e(0x1c090e0e), C32e(0x48362424), C32e(0x369b1b1b), C32e(0xa53ddfdf), + C32e(0x8126cdcd), C32e(0x9c694e4e), C32e(0xfecd7f7f), C32e(0xcf9feaea), + C32e(0x241b1212), C32e(0x3a9e1d1d), C32e(0xb0745858), C32e(0x682e3434), + C32e(0x6c2d3636), C32e(0xa3b2dcdc), C32e(0x73eeb4b4), C32e(0xb6fb5b5b), + C32e(0x53f6a4a4), C32e(0xec4d7676), C32e(0x7561b7b7), C32e(0xface7d7d), + C32e(0xa47b5252), C32e(0xa13edddd), C32e(0xbc715e5e), C32e(0x26971313), + C32e(0x57f5a6a6), C32e(0x6968b9b9), C32e(0x00000000), C32e(0x992cc1c1), + C32e(0x80604040), C32e(0xdd1fe3e3), C32e(0xf2c87979), C32e(0x77edb6b6), + C32e(0xb3bed4d4), C32e(0x01468d8d), C32e(0xced96767), C32e(0xe44b7272), + C32e(0x33de9494), C32e(0x2bd49898), C32e(0x7be8b0b0), C32e(0x114a8585), + C32e(0x6d6bbbbb), C32e(0x912ac5c5), C32e(0x9ee54f4f), C32e(0xc116eded), + C32e(0x17c58686), C32e(0x2fd79a9a), C32e(0xcc556666), C32e(0x22941111), + C32e(0x0fcf8a8a), C32e(0xc910e9e9), C32e(0x08060404), C32e(0xe781fefe), + C32e(0x5bf0a0a0), C32e(0xf0447878), C32e(0x4aba2525), C32e(0x96e34b4b), + C32e(0x5ff3a2a2), C32e(0xbafe5d5d), C32e(0x1bc08080), C32e(0x0a8a0505), + C32e(0x7ead3f3f), C32e(0x42bc2121), C32e(0xe0487070), C32e(0xf904f1f1), + C32e(0xc6df6363), C32e(0xeec17777), C32e(0x4575afaf), C32e(0x84634242), + C32e(0x40302020), C32e(0xd11ae5e5), C32e(0xe10efdfd), C32e(0x656dbfbf), + C32e(0x194c8181), C32e(0x30141818), C32e(0x4c352626), C32e(0x9d2fc3c3), + C32e(0x67e1bebe), C32e(0x6aa23535), C32e(0x0bcc8888), C32e(0x5c392e2e), + C32e(0x3d579393), C32e(0xaaf25555), C32e(0xe382fcfc), C32e(0xf4477a7a), + C32e(0x8bacc8c8), C32e(0x6fe7baba), C32e(0x642b3232), C32e(0xd795e6e6), + C32e(0x9ba0c0c0), C32e(0x32981919), C32e(0x27d19e9e), C32e(0x5d7fa3a3), + C32e(0x88664444), C32e(0xa87e5454), C32e(0x76ab3b3b), C32e(0x16830b0b), + C32e(0x03ca8c8c), C32e(0x9529c7c7), C32e(0xd6d36b6b), C32e(0x503c2828), + C32e(0x5579a7a7), C32e(0x63e2bcbc), C32e(0x2c1d1616), C32e(0x4176adad), + C32e(0xad3bdbdb), C32e(0xc8566464), C32e(0xe84e7474), C32e(0x281e1414), + C32e(0x3fdb9292), C32e(0x180a0c0c), C32e(0x906c4848), C32e(0x6be4b8b8), + C32e(0x255d9f9f), C32e(0x616ebdbd), C32e(0x86ef4343), C32e(0x93a6c4c4), + C32e(0x72a83939), C32e(0x62a43131), C32e(0xbd37d3d3), C32e(0xff8bf2f2), + C32e(0xb132d5d5), C32e(0x0d438b8b), C32e(0xdc596e6e), C32e(0xafb7dada), + C32e(0x028c0101), C32e(0x7964b1b1), C32e(0x23d29c9c), C32e(0x92e04949), + C32e(0xabb4d8d8), C32e(0x43faacac), C32e(0xfd07f3f3), C32e(0x8525cfcf), + C32e(0x8fafcaca), C32e(0xf38ef4f4), C32e(0x8ee94747), C32e(0x20181010), + C32e(0xded56f6f), C32e(0xfb88f0f0), C32e(0x946f4a4a), C32e(0xb8725c5c), + C32e(0x70243838), C32e(0xaef15757), C32e(0xe6c77373), C32e(0x35519797), + C32e(0x8d23cbcb), C32e(0x597ca1a1), C32e(0xcb9ce8e8), C32e(0x7c213e3e), + C32e(0x37dd9696), 
C32e(0xc2dc6161), C32e(0x1a860d0d), C32e(0x1e850f0f), + C32e(0xdb90e0e0), C32e(0xf8427c7c), C32e(0xe2c47171), C32e(0x83aacccc), + C32e(0x3bd89090), C32e(0x0c050606), C32e(0xf501f7f7), C32e(0x38121c1c), + C32e(0x9fa3c2c2), C32e(0xd45f6a6a), C32e(0x47f9aeae), C32e(0xd2d06969), + C32e(0x2e911717), C32e(0x29589999), C32e(0x74273a3a), C32e(0x4eb92727), + C32e(0xa938d9d9), C32e(0xcd13ebeb), C32e(0x56b32b2b), C32e(0x44332222), + C32e(0xbfbbd2d2), C32e(0x4970a9a9), C32e(0x0e890707), C32e(0x66a73333), + C32e(0x5ab62d2d), C32e(0x78223c3c), C32e(0x2a921515), C32e(0x8920c9c9), + C32e(0x15498787), C32e(0x4fffaaaa), C32e(0xa0785050), C32e(0x517aa5a5), + C32e(0x068f0303), C32e(0xb2f85959), C32e(0x12800909), C32e(0x34171a1a), + C32e(0xcada6565), C32e(0xb531d7d7), C32e(0x13c68484), C32e(0xbbb8d0d0), + C32e(0x1fc38282), C32e(0x52b02929), C32e(0xb4775a5a), C32e(0x3c111e1e), + C32e(0xf6cb7b7b), C32e(0x4bfca8a8), C32e(0xdad66d6d), C32e(0x583a2c2c) +}; + +static const sph_u32 T3dn[] = { + C32e(0x32f4a5f4), C32e(0x6f978497), C32e(0x5eb099b0), C32e(0x7a8c8d8c), + C32e(0xe8170d17), C32e(0x0adcbddc), C32e(0x16c8b1c8), C32e(0x6dfc54fc), + C32e(0x90f050f0), C32e(0x07050305), C32e(0x2ee0a9e0), C32e(0xd1877d87), + C32e(0xcc2b192b), C32e(0x13a662a6), C32e(0x7c31e631), C32e(0x59b59ab5), + C32e(0x40cf45cf), C32e(0xa3bc9dbc), C32e(0x49c040c0), C32e(0x68928792), + C32e(0xd03f153f), C32e(0x9426eb26), C32e(0xce40c940), C32e(0xe61d0b1d), + C32e(0x6e2fec2f), C32e(0x1aa967a9), C32e(0x431cfd1c), C32e(0x6025ea25), + C32e(0xf9dabfda), C32e(0x5102f702), C32e(0x45a196a1), C32e(0x76ed5bed), + C32e(0x285dc25d), C32e(0xc5241c24), C32e(0xd4e9aee9), C32e(0xf2be6abe), + C32e(0x82ee5aee), C32e(0xbdc341c3), C32e(0xf3060206), C32e(0x52d14fd1), + C32e(0x8ce45ce4), C32e(0x5607f407), C32e(0x8d5c345c), C32e(0xe1180818), + C32e(0x4cae93ae), C32e(0x3e957395), C32e(0x97f553f5), C32e(0x6b413f41), + C32e(0x1c140c14), C32e(0x63f652f6), C32e(0xe9af65af), C32e(0x7fe25ee2), + C32e(0x48782878), C32e(0xcff8a1f8), C32e(0x1b110f11), C32e(0xebc4b5c4), + C32e(0x151b091b), C32e(0x7e5a365a), C32e(0xadb69bb6), C32e(0x98473d47), + C32e(0xa76a266a), C32e(0xf5bb69bb), C32e(0x334ccd4c), C32e(0x50ba9fba), + C32e(0x3f2d1b2d), C32e(0xa4b99eb9), C32e(0xc49c749c), C32e(0x46722e72), + C32e(0x41772d77), C32e(0x11cdb2cd), C32e(0x9d29ee29), C32e(0x4d16fb16), + C32e(0xa501f601), C32e(0xa1d74dd7), C32e(0x14a361a3), C32e(0x3449ce49), + C32e(0xdf8d7b8d), C32e(0x9f423e42), C32e(0xcd937193), C32e(0xb1a297a2), + C32e(0xa204f504), C32e(0x01b868b8), C32e(0x00000000), C32e(0xb5742c74), + C32e(0xe0a060a0), C32e(0xc2211f21), C32e(0x3a43c843), C32e(0x9a2ced2c), + C32e(0x0dd9bed9), C32e(0x47ca46ca), C32e(0x1770d970), C32e(0xafdd4bdd), + C32e(0xed79de79), C32e(0xff67d467), C32e(0x9323e823), C32e(0x5bde4ade), + C32e(0x06bd6bbd), C32e(0xbb7e2a7e), C32e(0x7b34e534), C32e(0xd73a163a), + C32e(0xd254c554), C32e(0xf862d762), C32e(0x99ff55ff), C32e(0xb6a794a7), + C32e(0xc04acf4a), C32e(0xd9301030), C32e(0x0e0a060a), C32e(0x66988198), + C32e(0xab0bf00b), C32e(0xb4cc44cc), C32e(0xf0d5bad5), C32e(0x753ee33e), + C32e(0xac0ef30e), C32e(0x4419fe19), C32e(0xdb5bc05b), C32e(0x80858a85), + C32e(0xd3ecadec), C32e(0xfedfbcdf), C32e(0xa8d848d8), C32e(0xfd0c040c), + C32e(0x197adf7a), C32e(0x2f58c158), C32e(0x309f759f), C32e(0xe7a563a5), + C32e(0x70503050), C32e(0xcb2e1a2e), C32e(0xef120e12), C32e(0x08b76db7), + C32e(0x55d44cd4), C32e(0x243c143c), C32e(0x795f355f), C32e(0xb2712f71), + C32e(0x8638e138), C32e(0xc8fda2fd), C32e(0xc74fcc4f), C32e(0x654b394b), + C32e(0x6af957f9), C32e(0x580df20d), C32e(0x619d829d), 
C32e(0xb3c947c9), + C32e(0x27efacef), C32e(0x8832e732), C32e(0x4f7d2b7d), C32e(0x42a495a4), + C32e(0x3bfba0fb), C32e(0xaab398b3), C32e(0xf668d168), C32e(0x22817f81), + C32e(0xeeaa66aa), C32e(0xd6827e82), C32e(0xdde6abe6), C32e(0x959e839e), + C32e(0xc945ca45), C32e(0xbc7b297b), C32e(0x056ed36e), C32e(0x6c443c44), + C32e(0x2c8b798b), C32e(0x813de23d), C32e(0x31271d27), C32e(0x379a769a), + C32e(0x964d3b4d), C32e(0x9efa56fa), C32e(0xa6d24ed2), C32e(0x36221e22), + C32e(0xe476db76), C32e(0x121e0a1e), C32e(0xfcb46cb4), C32e(0x8f37e437), + C32e(0x78e75de7), C32e(0x0fb26eb2), C32e(0x692aef2a), C32e(0x35f1a6f1), + C32e(0xdae3a8e3), C32e(0xc6f7a4f7), C32e(0x8a593759), C32e(0x74868b86), + C32e(0x83563256), C32e(0x4ec543c5), C32e(0x85eb59eb), C32e(0x18c2b7c2), + C32e(0x8e8f8c8f), C32e(0x1dac64ac), C32e(0xf16dd26d), C32e(0x723be03b), + C32e(0x1fc7b4c7), C32e(0xb915fa15), C32e(0xfa090709), C32e(0xa06f256f), + C32e(0x20eaafea), C32e(0x7d898e89), C32e(0x6720e920), C32e(0x38281828), + C32e(0x0b64d564), C32e(0x73838883), C32e(0xfbb16fb1), C32e(0xca967296), + C32e(0x546c246c), C32e(0x5f08f108), C32e(0x2152c752), C32e(0x64f351f3), + C32e(0xae652365), C32e(0x25847c84), C32e(0x57bf9cbf), C32e(0x5d632163), + C32e(0xea7cdd7c), C32e(0x1e7fdc7f), C32e(0x9c918691), C32e(0x9b948594), + C32e(0x4bab90ab), C32e(0xbac642c6), C32e(0x2657c457), C32e(0x29e5aae5), + C32e(0xe373d873), C32e(0x090f050f), C32e(0xf4030103), C32e(0x2a361236), + C32e(0x3cfea3fe), C32e(0x8be15fe1), C32e(0xbe10f910), C32e(0x026bd06b), + C32e(0xbfa891a8), C32e(0x71e858e8), C32e(0x53692769), C32e(0xf7d0b9d0), + C32e(0x91483848), C32e(0xde351335), C32e(0xe5ceb3ce), C32e(0x77553355), + C32e(0x04d6bbd6), C32e(0x39907090), C32e(0x87808980), C32e(0xc1f2a7f2), + C32e(0xecc1b6c1), C32e(0x5a662266), C32e(0xb8ad92ad), C32e(0xa9602060), + C32e(0x5cdb49db), C32e(0xb01aff1a), C32e(0xd8887888), C32e(0x2b8e7a8e), + C32e(0x898a8f8a), C32e(0x4a13f813), C32e(0x929b809b), C32e(0x23391739), + C32e(0x1075da75), C32e(0x84533153), C32e(0xd551c651), C32e(0x03d3b8d3), + C32e(0xdc5ec35e), C32e(0xe2cbb0cb), C32e(0xc3997799), C32e(0x2d331133), + C32e(0x3d46cb46), C32e(0xb71ffc1f), C32e(0x0c61d661), C32e(0x624e3a4e) +}; + +#define DECL_STATE_SMALL \ + sph_u32 H[16]; + +#define READ_STATE_SMALL(sc) do { \ + memcpy(H, (sc)->state.narrow, sizeof H); \ + } while (0) + +#define WRITE_STATE_SMALL(sc) do { \ + memcpy((sc)->state.narrow, H, sizeof H); \ + } while (0) + +#define XCAT(x, y) XCAT_(x, y) +#define XCAT_(x, y) x ## y + +#define RSTT(d0, d1, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ + t[d0] = T0up[B32_0(a[b0])] \ + ^ T1up[B32_1(a[b1])] \ + ^ T2up[B32_2(a[b2])] \ + ^ T3up[B32_3(a[b3])] \ + ^ T0dn[B32_0(a[b4])] \ + ^ T1dn[B32_1(a[b5])] \ + ^ T2dn[B32_2(a[b6])] \ + ^ T3dn[B32_3(a[b7])]; \ + t[d1] = T0dn[B32_0(a[b0])] \ + ^ T1dn[B32_1(a[b1])] \ + ^ T2dn[B32_2(a[b2])] \ + ^ T3dn[B32_3(a[b3])] \ + ^ T0up[B32_0(a[b4])] \ + ^ T1up[B32_1(a[b5])] \ + ^ T2up[B32_2(a[b6])] \ + ^ T3up[B32_3(a[b7])]; \ + } while (0) + +#define ROUND_SMALL_P(a, r) do { \ + sph_u32 t[16]; \ + a[0x0] ^= PC32up(0x00, r); \ + a[0x1] ^= PC32dn(0x00, r); \ + a[0x2] ^= PC32up(0x10, r); \ + a[0x3] ^= PC32dn(0x10, r); \ + a[0x4] ^= PC32up(0x20, r); \ + a[0x5] ^= PC32dn(0x20, r); \ + a[0x6] ^= PC32up(0x30, r); \ + a[0x7] ^= PC32dn(0x30, r); \ + a[0x8] ^= PC32up(0x40, r); \ + a[0x9] ^= PC32dn(0x40, r); \ + a[0xA] ^= PC32up(0x50, r); \ + a[0xB] ^= PC32dn(0x50, r); \ + a[0xC] ^= PC32up(0x60, r); \ + a[0xD] ^= PC32dn(0x60, r); \ + a[0xE] ^= PC32up(0x70, r); \ + a[0xF] ^= PC32dn(0x70, r); \ + RSTT(0x0, 0x1, a, 0x0, 0x2, 0x4, 
0x6, 0x9, 0xB, 0xD, 0xF); \ + RSTT(0x2, 0x3, a, 0x2, 0x4, 0x6, 0x8, 0xB, 0xD, 0xF, 0x1); \ + RSTT(0x4, 0x5, a, 0x4, 0x6, 0x8, 0xA, 0xD, 0xF, 0x1, 0x3); \ + RSTT(0x6, 0x7, a, 0x6, 0x8, 0xA, 0xC, 0xF, 0x1, 0x3, 0x5); \ + RSTT(0x8, 0x9, a, 0x8, 0xA, 0xC, 0xE, 0x1, 0x3, 0x5, 0x7); \ + RSTT(0xA, 0xB, a, 0xA, 0xC, 0xE, 0x0, 0x3, 0x5, 0x7, 0x9); \ + RSTT(0xC, 0xD, a, 0xC, 0xE, 0x0, 0x2, 0x5, 0x7, 0x9, 0xB); \ + RSTT(0xE, 0xF, a, 0xE, 0x0, 0x2, 0x4, 0x7, 0x9, 0xB, 0xD); \ + memcpy(a, t, sizeof t); \ + } while (0) + +#define ROUND_SMALL_Q(a, r) do { \ + sph_u32 t[16]; \ + a[0x0] ^= QC32up(0x00, r); \ + a[0x1] ^= QC32dn(0x00, r); \ + a[0x2] ^= QC32up(0x10, r); \ + a[0x3] ^= QC32dn(0x10, r); \ + a[0x4] ^= QC32up(0x20, r); \ + a[0x5] ^= QC32dn(0x20, r); \ + a[0x6] ^= QC32up(0x30, r); \ + a[0x7] ^= QC32dn(0x30, r); \ + a[0x8] ^= QC32up(0x40, r); \ + a[0x9] ^= QC32dn(0x40, r); \ + a[0xA] ^= QC32up(0x50, r); \ + a[0xB] ^= QC32dn(0x50, r); \ + a[0xC] ^= QC32up(0x60, r); \ + a[0xD] ^= QC32dn(0x60, r); \ + a[0xE] ^= QC32up(0x70, r); \ + a[0xF] ^= QC32dn(0x70, r); \ + RSTT(0x0, 0x1, a, 0x2, 0x6, 0xA, 0xE, 0x1, 0x5, 0x9, 0xD); \ + RSTT(0x2, 0x3, a, 0x4, 0x8, 0xC, 0x0, 0x3, 0x7, 0xB, 0xF); \ + RSTT(0x4, 0x5, a, 0x6, 0xA, 0xE, 0x2, 0x5, 0x9, 0xD, 0x1); \ + RSTT(0x6, 0x7, a, 0x8, 0xC, 0x0, 0x4, 0x7, 0xB, 0xF, 0x3); \ + RSTT(0x8, 0x9, a, 0xA, 0xE, 0x2, 0x6, 0x9, 0xD, 0x1, 0x5); \ + RSTT(0xA, 0xB, a, 0xC, 0x0, 0x4, 0x8, 0xB, 0xF, 0x3, 0x7); \ + RSTT(0xC, 0xD, a, 0xE, 0x2, 0x6, 0xA, 0xD, 0x1, 0x5, 0x9); \ + RSTT(0xE, 0xF, a, 0x0, 0x4, 0x8, 0xC, 0xF, 0x3, 0x7, 0xB); \ + memcpy(a, t, sizeof t); \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_GROESTL + +#define PERM_SMALL_P(a) do { \ + int r; \ + for (r = 0; r < 10; r ++) \ + ROUND_SMALL_P(a, r); \ + } while (0) + +#define PERM_SMALL_Q(a) do { \ + int r; \ + for (r = 0; r < 10; r ++) \ + ROUND_SMALL_Q(a, r); \ + } while (0) + +#else + +#define PERM_SMALL_P(a) do { \ + int r; \ + for (r = 0; r < 10; r += 2) { \ + ROUND_SMALL_P(a, r + 0); \ + ROUND_SMALL_P(a, r + 1); \ + } \ + } while (0) + +#define PERM_SMALL_Q(a) do { \ + int r; \ + for (r = 0; r < 10; r += 2) { \ + ROUND_SMALL_Q(a, r + 0); \ + ROUND_SMALL_Q(a, r + 1); \ + } \ + } while (0) + +#endif + +#define COMPRESS_SMALL do { \ + sph_u32 g[16], m[16]; \ + size_t u; \ + for (u = 0; u < 16; u ++) { \ + m[u] = dec32e_aligned(buf + (u << 2)); \ + g[u] = m[u] ^ H[u]; \ + } \ + PERM_SMALL_P(g); \ + PERM_SMALL_Q(m); \ + for (u = 0; u < 16; u ++) \ + H[u] ^= g[u] ^ m[u]; \ + } while (0) + +#define FINAL_SMALL do { \ + sph_u32 x[16]; \ + size_t u; \ + memcpy(x, H, sizeof x); \ + PERM_SMALL_P(x); \ + for (u = 0; u < 16; u ++) \ + H[u] ^= x[u]; \ + } while (0) + +#define DECL_STATE_BIG \ + sph_u32 H[32]; + +#define READ_STATE_BIG(sc) do { \ + memcpy(H, (sc)->state.narrow, sizeof H); \ + } while (0) + +#define WRITE_STATE_BIG(sc) do { \ + memcpy((sc)->state.narrow, H, sizeof H); \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_GROESTL + +#define RBTT(d0, d1, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ + sph_u32 fu2 = T0up[B32_2(a[b2])]; \ + sph_u32 fd2 = T0dn[B32_2(a[b2])]; \ + sph_u32 fu3 = T1up[B32_3(a[b3])]; \ + sph_u32 fd3 = T1dn[B32_3(a[b3])]; \ + sph_u32 fu6 = T0up[B32_2(a[b6])]; \ + sph_u32 fd6 = T0dn[B32_2(a[b6])]; \ + sph_u32 fu7 = T1up[B32_3(a[b7])]; \ + sph_u32 fd7 = T1dn[B32_3(a[b7])]; \ + t[d0] = T0up[B32_0(a[b0])] \ + ^ T1up[B32_1(a[b1])] \ + ^ R32u(fu2, fd2) \ + ^ R32u(fu3, fd3) \ + ^ T0dn[B32_0(a[b4])] \ + ^ T1dn[B32_1(a[b5])] \ + ^ R32d(fu6, fd6) \ + ^ R32d(fu7, fd7); \ + t[d1] = T0dn[B32_0(a[b0])] \ + ^ 
T1dn[B32_1(a[b1])] \ + ^ R32d(fu2, fd2) \ + ^ R32d(fu3, fd3) \ + ^ T0up[B32_0(a[b4])] \ + ^ T1up[B32_1(a[b5])] \ + ^ R32u(fu6, fd6) \ + ^ R32u(fu7, fd7); \ + } while (0) + +#else + +#define RBTT(d0, d1, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ + t[d0] = T0up[B32_0(a[b0])] \ + ^ T1up[B32_1(a[b1])] \ + ^ T2up[B32_2(a[b2])] \ + ^ T3up[B32_3(a[b3])] \ + ^ T0dn[B32_0(a[b4])] \ + ^ T1dn[B32_1(a[b5])] \ + ^ T2dn[B32_2(a[b6])] \ + ^ T3dn[B32_3(a[b7])]; \ + t[d1] = T0dn[B32_0(a[b0])] \ + ^ T1dn[B32_1(a[b1])] \ + ^ T2dn[B32_2(a[b2])] \ + ^ T3dn[B32_3(a[b3])] \ + ^ T0up[B32_0(a[b4])] \ + ^ T1up[B32_1(a[b5])] \ + ^ T2up[B32_2(a[b6])] \ + ^ T3up[B32_3(a[b7])]; \ + } while (0) + +#endif + +#if SPH_SMALL_FOOTPRINT_GROESTL + +#define ROUND_BIG_P(a, r) do { \ + sph_u32 t[32]; \ + size_t u; \ + a[0x00] ^= PC32up(0x00, r); \ + a[0x01] ^= PC32dn(0x00, r); \ + a[0x02] ^= PC32up(0x10, r); \ + a[0x03] ^= PC32dn(0x10, r); \ + a[0x04] ^= PC32up(0x20, r); \ + a[0x05] ^= PC32dn(0x20, r); \ + a[0x06] ^= PC32up(0x30, r); \ + a[0x07] ^= PC32dn(0x30, r); \ + a[0x08] ^= PC32up(0x40, r); \ + a[0x09] ^= PC32dn(0x40, r); \ + a[0x0A] ^= PC32up(0x50, r); \ + a[0x0B] ^= PC32dn(0x50, r); \ + a[0x0C] ^= PC32up(0x60, r); \ + a[0x0D] ^= PC32dn(0x60, r); \ + a[0x0E] ^= PC32up(0x70, r); \ + a[0x0F] ^= PC32dn(0x70, r); \ + a[0x10] ^= PC32up(0x80, r); \ + a[0x11] ^= PC32dn(0x80, r); \ + a[0x12] ^= PC32up(0x90, r); \ + a[0x13] ^= PC32dn(0x90, r); \ + a[0x14] ^= PC32up(0xA0, r); \ + a[0x15] ^= PC32dn(0xA0, r); \ + a[0x16] ^= PC32up(0xB0, r); \ + a[0x17] ^= PC32dn(0xB0, r); \ + a[0x18] ^= PC32up(0xC0, r); \ + a[0x19] ^= PC32dn(0xC0, r); \ + a[0x1A] ^= PC32up(0xD0, r); \ + a[0x1B] ^= PC32dn(0xD0, r); \ + a[0x1C] ^= PC32up(0xE0, r); \ + a[0x1D] ^= PC32dn(0xE0, r); \ + a[0x1E] ^= PC32up(0xF0, r); \ + a[0x1F] ^= PC32dn(0xF0, r); \ + for (u = 0; u < 32; u += 8) { \ + RBTT(u + 0x00, (u + 0x01) & 0x1F, a, \ + u + 0x00, (u + 0x02) & 0x1F, \ + (u + 0x04) & 0x1F, (u + 0x06) & 0x1F, \ + (u + 0x09) & 0x1F, (u + 0x0B) & 0x1F, \ + (u + 0x0D) & 0x1F, (u + 0x17) & 0x1F); \ + RBTT(u + 0x02, (u + 0x03) & 0x1F, a, \ + u + 0x02, (u + 0x04) & 0x1F, \ + (u + 0x06) & 0x1F, (u + 0x08) & 0x1F, \ + (u + 0x0B) & 0x1F, (u + 0x0D) & 0x1F, \ + (u + 0x0F) & 0x1F, (u + 0x19) & 0x1F); \ + RBTT(u + 0x04, (u + 0x05) & 0x1F, a, \ + u + 0x04, (u + 0x06) & 0x1F, \ + (u + 0x08) & 0x1F, (u + 0x0A) & 0x1F, \ + (u + 0x0D) & 0x1F, (u + 0x0F) & 0x1F, \ + (u + 0x11) & 0x1F, (u + 0x1B) & 0x1F); \ + RBTT(u + 0x06, (u + 0x07) & 0x1F, a, \ + u + 0x06, (u + 0x08) & 0x1F, \ + (u + 0x0A) & 0x1F, (u + 0x0C) & 0x1F, \ + (u + 0x0F) & 0x1F, (u + 0x11) & 0x1F, \ + (u + 0x13) & 0x1F, (u + 0x1D) & 0x1F); \ + } \ + memcpy(a, t, sizeof t); \ + } while (0) + +#define ROUND_BIG_Q(a, r) do { \ + sph_u32 t[32]; \ + size_t u; \ + a[0x00] ^= QC32up(0x00, r); \ + a[0x01] ^= QC32dn(0x00, r); \ + a[0x02] ^= QC32up(0x10, r); \ + a[0x03] ^= QC32dn(0x10, r); \ + a[0x04] ^= QC32up(0x20, r); \ + a[0x05] ^= QC32dn(0x20, r); \ + a[0x06] ^= QC32up(0x30, r); \ + a[0x07] ^= QC32dn(0x30, r); \ + a[0x08] ^= QC32up(0x40, r); \ + a[0x09] ^= QC32dn(0x40, r); \ + a[0x0A] ^= QC32up(0x50, r); \ + a[0x0B] ^= QC32dn(0x50, r); \ + a[0x0C] ^= QC32up(0x60, r); \ + a[0x0D] ^= QC32dn(0x60, r); \ + a[0x0E] ^= QC32up(0x70, r); \ + a[0x0F] ^= QC32dn(0x70, r); \ + a[0x10] ^= QC32up(0x80, r); \ + a[0x11] ^= QC32dn(0x80, r); \ + a[0x12] ^= QC32up(0x90, r); \ + a[0x13] ^= QC32dn(0x90, r); \ + a[0x14] ^= QC32up(0xA0, r); \ + a[0x15] ^= QC32dn(0xA0, r); \ + a[0x16] ^= QC32up(0xB0, r); \ + a[0x17] ^= QC32dn(0xB0, r); \ + a[0x18] ^= 
QC32up(0xC0, r); \ + a[0x19] ^= QC32dn(0xC0, r); \ + a[0x1A] ^= QC32up(0xD0, r); \ + a[0x1B] ^= QC32dn(0xD0, r); \ + a[0x1C] ^= QC32up(0xE0, r); \ + a[0x1D] ^= QC32dn(0xE0, r); \ + a[0x1E] ^= QC32up(0xF0, r); \ + a[0x1F] ^= QC32dn(0xF0, r); \ + for (u = 0; u < 32; u += 8) { \ + RBTT(u + 0x00, (u + 0x01) & 0x1F, a, \ + (u + 0x02) & 0x1F, (u + 0x06) & 0x1F, \ + (u + 0x0A) & 0x1F, (u + 0x16) & 0x1F, \ + (u + 0x01) & 0x1F, (u + 0x05) & 0x1F, \ + (u + 0x09) & 0x1F, (u + 0x0D) & 0x1F); \ + RBTT(u + 0x02, (u + 0x03) & 0x1F, a, \ + (u + 0x04) & 0x1F, (u + 0x08) & 0x1F, \ + (u + 0x0C) & 0x1F, (u + 0x18) & 0x1F, \ + (u + 0x03) & 0x1F, (u + 0x07) & 0x1F, \ + (u + 0x0B) & 0x1F, (u + 0x0F) & 0x1F); \ + RBTT(u + 0x04, (u + 0x05) & 0x1F, a, \ + (u + 0x06) & 0x1F, (u + 0x0A) & 0x1F, \ + (u + 0x0E) & 0x1F, (u + 0x1A) & 0x1F, \ + (u + 0x05) & 0x1F, (u + 0x09) & 0x1F, \ + (u + 0x0D) & 0x1F, (u + 0x11) & 0x1F); \ + RBTT(u + 0x06, (u + 0x07) & 0x1F, a, \ + (u + 0x08) & 0x1F, (u + 0x0C) & 0x1F, \ + (u + 0x10) & 0x1F, (u + 0x1C) & 0x1F, \ + (u + 0x07) & 0x1F, (u + 0x0B) & 0x1F, \ + (u + 0x0F) & 0x1F, (u + 0x13) & 0x1F); \ + } \ + memcpy(a, t, sizeof t); \ + } while (0) + +#else + +#define ROUND_BIG_P(a, r) do { \ + sph_u32 t[32]; \ + a[0x00] ^= PC32up(0x00, r); \ + a[0x01] ^= PC32dn(0x00, r); \ + a[0x02] ^= PC32up(0x10, r); \ + a[0x03] ^= PC32dn(0x10, r); \ + a[0x04] ^= PC32up(0x20, r); \ + a[0x05] ^= PC32dn(0x20, r); \ + a[0x06] ^= PC32up(0x30, r); \ + a[0x07] ^= PC32dn(0x30, r); \ + a[0x08] ^= PC32up(0x40, r); \ + a[0x09] ^= PC32dn(0x40, r); \ + a[0x0A] ^= PC32up(0x50, r); \ + a[0x0B] ^= PC32dn(0x50, r); \ + a[0x0C] ^= PC32up(0x60, r); \ + a[0x0D] ^= PC32dn(0x60, r); \ + a[0x0E] ^= PC32up(0x70, r); \ + a[0x0F] ^= PC32dn(0x70, r); \ + a[0x10] ^= PC32up(0x80, r); \ + a[0x11] ^= PC32dn(0x80, r); \ + a[0x12] ^= PC32up(0x90, r); \ + a[0x13] ^= PC32dn(0x90, r); \ + a[0x14] ^= PC32up(0xA0, r); \ + a[0x15] ^= PC32dn(0xA0, r); \ + a[0x16] ^= PC32up(0xB0, r); \ + a[0x17] ^= PC32dn(0xB0, r); \ + a[0x18] ^= PC32up(0xC0, r); \ + a[0x19] ^= PC32dn(0xC0, r); \ + a[0x1A] ^= PC32up(0xD0, r); \ + a[0x1B] ^= PC32dn(0xD0, r); \ + a[0x1C] ^= PC32up(0xE0, r); \ + a[0x1D] ^= PC32dn(0xE0, r); \ + a[0x1E] ^= PC32up(0xF0, r); \ + a[0x1F] ^= PC32dn(0xF0, r); \ + RBTT(0x00, 0x01, a, \ + 0x00, 0x02, 0x04, 0x06, 0x09, 0x0B, 0x0D, 0x17); \ + RBTT(0x02, 0x03, a, \ + 0x02, 0x04, 0x06, 0x08, 0x0B, 0x0D, 0x0F, 0x19); \ + RBTT(0x04, 0x05, a, \ + 0x04, 0x06, 0x08, 0x0A, 0x0D, 0x0F, 0x11, 0x1B); \ + RBTT(0x06, 0x07, a, \ + 0x06, 0x08, 0x0A, 0x0C, 0x0F, 0x11, 0x13, 0x1D); \ + RBTT(0x08, 0x09, a, \ + 0x08, 0x0A, 0x0C, 0x0E, 0x11, 0x13, 0x15, 0x1F); \ + RBTT(0x0A, 0x0B, a, \ + 0x0A, 0x0C, 0x0E, 0x10, 0x13, 0x15, 0x17, 0x01); \ + RBTT(0x0C, 0x0D, a, \ + 0x0C, 0x0E, 0x10, 0x12, 0x15, 0x17, 0x19, 0x03); \ + RBTT(0x0E, 0x0F, a, \ + 0x0E, 0x10, 0x12, 0x14, 0x17, 0x19, 0x1B, 0x05); \ + RBTT(0x10, 0x11, a, \ + 0x10, 0x12, 0x14, 0x16, 0x19, 0x1B, 0x1D, 0x07); \ + RBTT(0x12, 0x13, a, \ + 0x12, 0x14, 0x16, 0x18, 0x1B, 0x1D, 0x1F, 0x09); \ + RBTT(0x14, 0x15, a, \ + 0x14, 0x16, 0x18, 0x1A, 0x1D, 0x1F, 0x01, 0x0B); \ + RBTT(0x16, 0x17, a, \ + 0x16, 0x18, 0x1A, 0x1C, 0x1F, 0x01, 0x03, 0x0D); \ + RBTT(0x18, 0x19, a, \ + 0x18, 0x1A, 0x1C, 0x1E, 0x01, 0x03, 0x05, 0x0F); \ + RBTT(0x1A, 0x1B, a, \ + 0x1A, 0x1C, 0x1E, 0x00, 0x03, 0x05, 0x07, 0x11); \ + RBTT(0x1C, 0x1D, a, \ + 0x1C, 0x1E, 0x00, 0x02, 0x05, 0x07, 0x09, 0x13); \ + RBTT(0x1E, 0x1F, a, \ + 0x1E, 0x00, 0x02, 0x04, 0x07, 0x09, 0x0B, 0x15); \ + memcpy(a, t, sizeof t); \ + } while (0) + +#define ROUND_BIG_Q(a, 
r) do { \ + sph_u32 t[32]; \ + a[0x00] ^= QC32up(0x00, r); \ + a[0x01] ^= QC32dn(0x00, r); \ + a[0x02] ^= QC32up(0x10, r); \ + a[0x03] ^= QC32dn(0x10, r); \ + a[0x04] ^= QC32up(0x20, r); \ + a[0x05] ^= QC32dn(0x20, r); \ + a[0x06] ^= QC32up(0x30, r); \ + a[0x07] ^= QC32dn(0x30, r); \ + a[0x08] ^= QC32up(0x40, r); \ + a[0x09] ^= QC32dn(0x40, r); \ + a[0x0A] ^= QC32up(0x50, r); \ + a[0x0B] ^= QC32dn(0x50, r); \ + a[0x0C] ^= QC32up(0x60, r); \ + a[0x0D] ^= QC32dn(0x60, r); \ + a[0x0E] ^= QC32up(0x70, r); \ + a[0x0F] ^= QC32dn(0x70, r); \ + a[0x10] ^= QC32up(0x80, r); \ + a[0x11] ^= QC32dn(0x80, r); \ + a[0x12] ^= QC32up(0x90, r); \ + a[0x13] ^= QC32dn(0x90, r); \ + a[0x14] ^= QC32up(0xA0, r); \ + a[0x15] ^= QC32dn(0xA0, r); \ + a[0x16] ^= QC32up(0xB0, r); \ + a[0x17] ^= QC32dn(0xB0, r); \ + a[0x18] ^= QC32up(0xC0, r); \ + a[0x19] ^= QC32dn(0xC0, r); \ + a[0x1A] ^= QC32up(0xD0, r); \ + a[0x1B] ^= QC32dn(0xD0, r); \ + a[0x1C] ^= QC32up(0xE0, r); \ + a[0x1D] ^= QC32dn(0xE0, r); \ + a[0x1E] ^= QC32up(0xF0, r); \ + a[0x1F] ^= QC32dn(0xF0, r); \ + RBTT(0x00, 0x01, a, \ + 0x02, 0x06, 0x0A, 0x16, 0x01, 0x05, 0x09, 0x0D); \ + RBTT(0x02, 0x03, a, \ + 0x04, 0x08, 0x0C, 0x18, 0x03, 0x07, 0x0B, 0x0F); \ + RBTT(0x04, 0x05, a, \ + 0x06, 0x0A, 0x0E, 0x1A, 0x05, 0x09, 0x0D, 0x11); \ + RBTT(0x06, 0x07, a, \ + 0x08, 0x0C, 0x10, 0x1C, 0x07, 0x0B, 0x0F, 0x13); \ + RBTT(0x08, 0x09, a, \ + 0x0A, 0x0E, 0x12, 0x1E, 0x09, 0x0D, 0x11, 0x15); \ + RBTT(0x0A, 0x0B, a, \ + 0x0C, 0x10, 0x14, 0x00, 0x0B, 0x0F, 0x13, 0x17); \ + RBTT(0x0C, 0x0D, a, \ + 0x0E, 0x12, 0x16, 0x02, 0x0D, 0x11, 0x15, 0x19); \ + RBTT(0x0E, 0x0F, a, \ + 0x10, 0x14, 0x18, 0x04, 0x0F, 0x13, 0x17, 0x1B); \ + RBTT(0x10, 0x11, a, \ + 0x12, 0x16, 0x1A, 0x06, 0x11, 0x15, 0x19, 0x1D); \ + RBTT(0x12, 0x13, a, \ + 0x14, 0x18, 0x1C, 0x08, 0x13, 0x17, 0x1B, 0x1F); \ + RBTT(0x14, 0x15, a, \ + 0x16, 0x1A, 0x1E, 0x0A, 0x15, 0x19, 0x1D, 0x01); \ + RBTT(0x16, 0x17, a, \ + 0x18, 0x1C, 0x00, 0x0C, 0x17, 0x1B, 0x1F, 0x03); \ + RBTT(0x18, 0x19, a, \ + 0x1A, 0x1E, 0x02, 0x0E, 0x19, 0x1D, 0x01, 0x05); \ + RBTT(0x1A, 0x1B, a, \ + 0x1C, 0x00, 0x04, 0x10, 0x1B, 0x1F, 0x03, 0x07); \ + RBTT(0x1C, 0x1D, a, \ + 0x1E, 0x02, 0x06, 0x12, 0x1D, 0x01, 0x05, 0x09); \ + RBTT(0x1E, 0x1F, a, \ + 0x00, 0x04, 0x08, 0x14, 0x1F, 0x03, 0x07, 0x0B); \ + memcpy(a, t, sizeof t); \ + } while (0) + +#endif + +#if SPH_SMALL_FOOTPRINT_GROESTL + +#define PERM_BIG_P(a) do { \ + int r; \ + for (r = 0; r < 14; r ++) \ + ROUND_BIG_P(a, r); \ + } while (0) + +#define PERM_BIG_Q(a) do { \ + int r; \ + for (r = 0; r < 14; r ++) \ + ROUND_BIG_Q(a, r); \ + } while (0) + +#else + +#define PERM_BIG_P(a) do { \ + int r; \ + for (r = 0; r < 14; r += 2) { \ + ROUND_BIG_P(a, r + 0); \ + ROUND_BIG_P(a, r + 1); \ + } \ + } while (0) + +#define PERM_BIG_Q(a) do { \ + int r; \ + for (r = 0; r < 14; r += 2) { \ + ROUND_BIG_Q(a, r + 0); \ + ROUND_BIG_Q(a, r + 1); \ + } \ + } while (0) + +#endif + +#define COMPRESS_BIG do { \ + sph_u32 g[32], m[32]; \ + size_t u; \ + for (u = 0; u < 32; u ++) { \ + m[u] = dec32e_aligned(buf + (u << 2)); \ + g[u] = m[u] ^ H[u]; \ + } \ + PERM_BIG_P(g); \ + PERM_BIG_Q(m); \ + for (u = 0; u < 32; u ++) \ + H[u] ^= g[u] ^ m[u]; \ + } while (0) + +#define FINAL_BIG do { \ + sph_u32 x[32]; \ + size_t u; \ + memcpy(x, H, sizeof x); \ + PERM_BIG_P(x); \ + for (u = 0; u < 32; u ++) \ + H[u] ^= x[u]; \ + } while (0) + +#endif + +static void +groestl_small_init(sph_groestl_small_context *sc, unsigned out_size) +{ + size_t u; + + sc->ptr = 0; +#if SPH_GROESTL_64 + for (u = 0; u < 7; u ++) + 
sc->state.wide[u] = 0; +#if USE_LE + sc->state.wide[7] = ((sph_u64)(out_size & 0xFF) << 56) + | ((sph_u64)(out_size & 0xFF00) << 40); +#else + sc->state.wide[7] = (sph_u64)out_size; +#endif +#else + for (u = 0; u < 15; u ++) + sc->state.narrow[u] = 0; +#if USE_LE + sc->state.narrow[15] = ((sph_u32)(out_size & 0xFF) << 24) + | ((sph_u32)(out_size & 0xFF00) << 8); +#else + sc->state.narrow[15] = (sph_u32)out_size; +#endif +#endif +#if SPH_64 + sc->count = 0; +#else + sc->count_high = 0; + sc->count_low = 0; +#endif +} + +static void +groestl_small_core(sph_groestl_small_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + DECL_STATE_SMALL + + buf = sc->buf; + ptr = sc->ptr; + if (len < (sizeof sc->buf) - ptr) { + memcpy(buf + ptr, data, len); + ptr += len; + sc->ptr = ptr; + return; + } + + READ_STATE_SMALL(sc); + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data = (const unsigned char *)data + clen; + len -= clen; + if (ptr == sizeof sc->buf) { + COMPRESS_SMALL; +#if SPH_64 + sc->count ++; +#else + if ((sc->count_low = SPH_T32(sc->count_low + 1)) == 0) + sc->count_high = SPH_T32(sc->count_high + 1); +#endif + ptr = 0; + } + } + WRITE_STATE_SMALL(sc); + sc->ptr = ptr; +} + +static void +groestl_small_close(sph_groestl_small_context *sc, + unsigned ub, unsigned n, void *dst, size_t out_len) +{ + unsigned char *buf; + unsigned char pad[72]; + size_t u, ptr, pad_len; +#if SPH_64 + sph_u64 count; +#else + sph_u32 count_high, count_low; +#endif + unsigned z; + DECL_STATE_SMALL + + buf = sc->buf; + ptr = sc->ptr; + z = 0x80 >> n; + pad[0] = ((ub & -z) | z) & 0xFF; + if (ptr < 56) { + pad_len = 64 - ptr; +#if SPH_64 + count = SPH_T64(sc->count + 1); +#else + count_low = SPH_T32(sc->count_low + 1); + count_high = SPH_T32(sc->count_high); + if (count_low == 0) + count_high = SPH_T32(count_high + 1); +#endif + } else { + pad_len = 128 - ptr; +#if SPH_64 + count = SPH_T64(sc->count + 2); +#else + count_low = SPH_T32(sc->count_low + 2); + count_high = SPH_T32(sc->count_high); + if (count_low <= 1) + count_high = SPH_T32(count_high + 1); +#endif + } + memset(pad + 1, 0, pad_len - 9); +#if SPH_64 + sph_enc64be(pad + pad_len - 8, count); +#else + sph_enc64be(pad + pad_len - 8, count_high); + sph_enc64be(pad + pad_len - 4, count_low); +#endif + groestl_small_core(sc, pad, pad_len); + READ_STATE_SMALL(sc); + FINAL_SMALL; +#if SPH_GROESTL_64 + for (u = 0; u < 4; u ++) + enc64e(pad + (u << 3), H[u + 4]); +#else + for (u = 0; u < 8; u ++) + enc32e(pad + (u << 2), H[u + 8]); +#endif + memcpy(dst, pad + 32 - out_len, out_len); + groestl_small_init(sc, (unsigned)out_len << 3); +} + +static void +groestl_big_init(sph_groestl_big_context *sc, unsigned out_size) +{ + size_t u; + + sc->ptr = 0; +#if SPH_GROESTL_64 + for (u = 0; u < 15; u ++) + sc->state.wide[u] = 0; +#if USE_LE + sc->state.wide[15] = ((sph_u64)(out_size & 0xFF) << 56) + | ((sph_u64)(out_size & 0xFF00) << 40); +#else + sc->state.wide[15] = (sph_u64)out_size; +#endif +#else + for (u = 0; u < 31; u ++) + sc->state.narrow[u] = 0; +#if USE_LE + sc->state.narrow[31] = ((sph_u32)(out_size & 0xFF) << 24) + | ((sph_u32)(out_size & 0xFF00) << 8); +#else + sc->state.narrow[31] = (sph_u32)out_size; +#endif +#endif +#if SPH_64 + sc->count = 0; +#else + sc->count_high = 0; + sc->count_low = 0; +#endif +} + +static void +groestl_big_core(sph_groestl_big_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t 
ptr; + DECL_STATE_BIG + + buf = sc->buf; + ptr = sc->ptr; + if (len < (sizeof sc->buf) - ptr) { + memcpy(buf + ptr, data, len); + ptr += len; + sc->ptr = ptr; + return; + } + + READ_STATE_BIG(sc); + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data = (const unsigned char *)data + clen; + len -= clen; + if (ptr == sizeof sc->buf) { + COMPRESS_BIG; +#if SPH_64 + sc->count ++; +#else + if ((sc->count_low = SPH_T32(sc->count_low + 1)) == 0) + sc->count_high = SPH_T32(sc->count_high + 1); +#endif + ptr = 0; + } + } + WRITE_STATE_BIG(sc); + sc->ptr = ptr; +} + +static void +groestl_big_close(sph_groestl_big_context *sc, + unsigned ub, unsigned n, void *dst, size_t out_len) +{ + unsigned char *buf; + unsigned char pad[136]; + size_t ptr, pad_len, u; +#if SPH_64 + sph_u64 count; +#else + sph_u32 count_high, count_low; +#endif + unsigned z; + DECL_STATE_BIG + + buf = sc->buf; + ptr = sc->ptr; + z = 0x80 >> n; + pad[0] = ((ub & -z) | z) & 0xFF; + if (ptr < 120) { + pad_len = 128 - ptr; +#if SPH_64 + count = SPH_T64(sc->count + 1); +#else + count_low = SPH_T32(sc->count_low + 1); + count_high = SPH_T32(sc->count_high); + if (count_low == 0) + count_high = SPH_T32(count_high + 1); +#endif + } else { + pad_len = 256 - ptr; +#if SPH_64 + count = SPH_T64(sc->count + 2); +#else + count_low = SPH_T32(sc->count_low + 2); + count_high = SPH_T32(sc->count_high); + if (count_low <= 1) + count_high = SPH_T32(count_high + 1); +#endif + } + memset(pad + 1, 0, pad_len - 9); +#if SPH_64 + sph_enc64be(pad + pad_len - 8, count); +#else + sph_enc64be(pad + pad_len - 8, count_high); + sph_enc64be(pad + pad_len - 4, count_low); +#endif + groestl_big_core(sc, pad, pad_len); + READ_STATE_BIG(sc); + FINAL_BIG; +#if SPH_GROESTL_64 + for (u = 0; u < 8; u ++) + enc64e(pad + (u << 3), H[u + 8]); +#else + for (u = 0; u < 16; u ++) + enc32e(pad + (u << 2), H[u + 16]); +#endif + memcpy(dst, pad + 64 - out_len, out_len); + groestl_big_init(sc, (unsigned)out_len << 3); +} + +/* see sph_groestl.h */ +void +sph_groestl224_init(void *cc) +{ + groestl_small_init(cc, 224); +} + +/* see sph_groestl.h */ +void +sph_groestl224(void *cc, const void *data, size_t len) +{ + groestl_small_core(cc, data, len); +} + +/* see sph_groestl.h */ +void +sph_groestl224_close(void *cc, void *dst) +{ + groestl_small_close(cc, 0, 0, dst, 28); +} + +/* see sph_groestl.h */ +void +sph_groestl224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + groestl_small_close(cc, ub, n, dst, 28); +} + +/* see sph_groestl.h */ +void +sph_groestl256_init(void *cc) +{ + groestl_small_init(cc, 256); +} + +/* see sph_groestl.h */ +void +sph_groestl256(void *cc, const void *data, size_t len) +{ + groestl_small_core(cc, data, len); +} + +/* see sph_groestl.h */ +void +sph_groestl256_close(void *cc, void *dst) +{ + groestl_small_close(cc, 0, 0, dst, 32); +} + +/* see sph_groestl.h */ +void +sph_groestl256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + groestl_small_close(cc, ub, n, dst, 32); +} + +/* see sph_groestl.h */ +void +sph_groestl384_init(void *cc) +{ + groestl_big_init(cc, 384); +} + +/* see sph_groestl.h */ +void +sph_groestl384(void *cc, const void *data, size_t len) +{ + groestl_big_core(cc, data, len); +} + +/* see sph_groestl.h */ +void +sph_groestl384_close(void *cc, void *dst) +{ + groestl_big_close(cc, 0, 0, dst, 48); +} + +/* see sph_groestl.h */ +void +sph_groestl384_addbits_and_close(void *cc, unsigned ub, 
unsigned n, void *dst) +{ + groestl_big_close(cc, ub, n, dst, 48); +} + +/* see sph_groestl.h */ +void +sph_groestl512_init(void *cc) +{ + groestl_big_init(cc, 512); +} + +/* see sph_groestl.h */ +void +sph_groestl512(void *cc, const void *data, size_t len) +{ + groestl_big_core(cc, data, len); +} + +/* see sph_groestl.h */ +void +sph_groestl512_close(void *cc, void *dst) +{ + groestl_big_close(cc, 0, 0, dst, 64); +} + +/* see sph_groestl.h */ +void +sph_groestl512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + groestl_big_close(cc, ub, n, dst, 64); +} + +#ifdef __cplusplus +} +#endif diff --git a/heavy.cu b/heavy.cu new file mode 100644 index 0000000..6ce1a66 --- /dev/null +++ b/heavy.cu @@ -0,0 +1,416 @@ +#include +#include +#include +#include "cuda_runtime.h" +#include "device_launch_parameters.h" +#include + +#ifndef _WIN32 +#include +#endif + +// include thrust +#include +#include +#include +#include + +#include "miner.h" + +#include "hefty1.h" +#include "sph_keccak.h" +#include "sph_blake.h" +#include "sph_groestl.h" + +#include "cuda_hefty1.h" +#include "cuda_sha256.h" +#include "cuda_keccak512.h" +#include "cuda_groestl512.h" +#include "cuda_blake512.h" +#include "cuda_combine.h" + +extern uint32_t *d_hash2output[8]; +extern uint32_t *d_hash3output[8]; +extern uint32_t *d_hash4output[8]; +extern uint32_t *d_hash5output[8]; + +#define HEAVYCOIN_BLKHDR_SZ 84 + +// nonce array for the threads +uint32_t *d_nonceVector[8]; + +/* Combines top 64-bits from each hash into a single hash */ +static void combine_hashes(uint32_t *out, const uint32_t *hash1, const uint32_t *hash2, const uint32_t *hash3, const uint32_t *hash4) +{ + const uint32_t *hash[4] = { hash1, hash2, hash3, hash4 }; + int bits; + unsigned int i; + uint32_t mask; + unsigned int k; + + /* Transpose first 64 bits of each hash into out */ + memset(out, 0, 32); + bits = 0; + for (i = 7; i >= 6; i--) { + for (mask = 0x80000000; mask; mask >>= 1) { + for (k = 0; k < 4; k++) { + out[(255 - bits)/32] <<= 1; + if ((hash[k][i] & mask) != 0) + out[(255 - bits)/32] |= 1; + bits++; + } + } + } +} + +#ifdef _MSC_VER +#include +static uint32_t __inline bitsset( uint32_t x ) +{ + DWORD r = 0; + _BitScanReverse(&r, x); + return r; +} +#else +static uint32_t bitsset( uint32_t x ) +{ + return 31-__builtin_clz(x); +} +#endif + +// Find the high bit in a multiword integer. +static int findhighbit(const uint32_t *ptarget, int words) +{ + int i; + int highbit = 0; + for (i=words-1; i >= 0; --i) + { + if (ptarget[i] != 0) { + highbit = i*32 + bitsset(ptarget[i])+1; + break; + } + } + return highbit; +} + +// Generate a multiword integer that represents the number +// (2 << highbit) - 1. +static void genmask(uint32_t *ptarget, int words, int highbit) +{ + int i; + for (i=words-1; i >= 0; --i) + { + if ((i+1)*32 <= highbit) + ptarget[i] = 0xffffffff; + else if (i*32 > highbit) + ptarget[i] = 0x00000000; + else + ptarget[i] = (1 << (highbit-i*32)) - 1; + } +}
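/*
 * Illustrative sketch (assumes the findhighbit() and genmask() helpers above;
 * the function name is hypothetical and not part of the original file).
 * For a 256-bit target whose highest set bit is bit 39, findhighbit() returns
 * 40, and genmask(mask, 2, 40) fills the two 32-bit words with the value
 * (1 << 40) - 1, i.e. bits 0..39 set.
 */
static void example_target_mask(void)
{
    uint32_t ptarget[8] = { 0 };
    uint32_t mask[2];
    int highbit;

    ptarget[1] = 0x00000080;            /* bit 39 of the 256-bit target */
    highbit = findhighbit(ptarget, 8);  /* -> 40 */
    genmask(mask, 2, highbit);          /* mask[1] = 0x000000FF, mask[0] = 0xFFFFFFFF */
    (void)mask;
}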
+ +struct check_nonce_for_remove +{ + check_nonce_for_remove(uint64_t target, uint32_t *hashes, uint32_t hashlen, uint32_t startNonce) : + m_target(target), + m_hashes(hashes), + m_hashlen(hashlen), + m_startNonce(startNonce) { } + + __device__ + bool operator()(const uint32_t x) + { + // Position in the hash buffer + uint32_t hashIndex = x - m_startNonce; + // Read the hash value (as uint64_t). It is stored in the 6th and 7th + // words of the hash (each of these hashes is 512 bits). + uint64_t hashValue = *((uint64_t*)(&m_hashes[m_hashlen*hashIndex + 6])); + // Check against the target: only bits that are set in the target may be set in the hash. + return (hashValue & m_target) != hashValue; + } + + uint64_t m_target; + uint32_t *m_hashes; + uint32_t m_hashlen; + uint32_t m_startNonce; +};
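/*
 * Host-side sketch of the keep/remove rule implemented by the functor above
 * (hypothetical helper, shown only for illustration): a candidate nonce
 * survives the thrust::remove_if compaction only if its 64-bit hash word has
 * no bits set outside the target mask produced by genmask().
 */
static bool would_remove(uint64_t hash64, uint64_t target_mask)
{
    /* true  -> remove_if drops the nonce
     * false -> the nonce is kept for the next hashing stage */
    return (hash64 & target_mask) != hash64;
}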
+ +// Determine the number of CUDA devices in the system +extern "C" int cuda_num_devices() +{ + int version; + cudaError_t err = cudaDriverGetVersion(&version); + if (err != cudaSuccess) + { + applog(LOG_ERR, "Unable to query CUDA driver version! Is an nVidia driver installed?"); + exit(1); + } + + int maj = version / 1000, min = version % 100; // same as in deviceQuery sample + if (maj < 5 || (maj == 5 && min < 5)) + { + applog(LOG_ERR, "Driver does not support CUDA %d.%d API! Update your nVidia driver!", 5, 5); + exit(1); + } + + int GPU_N; + err = cudaGetDeviceCount(&GPU_N); + if (err != cudaSuccess) + { + applog(LOG_ERR, "Unable to query number of CUDA devices! Is an nVidia driver installed?"); + exit(1); + } + return GPU_N; +} + +// Time synchronization routine from cudaminer, with CPU sleep +typedef struct { double value[8]; } tsumarray; +cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id) +{ + cudaError_t result = cudaSuccess; + if (situation >= 0) + { + static std::map tsum; + + double a = 0.95, b = 0.05; + if (tsum.find(situation) == tsum.end()) { a = 0.5; b = 0.5; } // faster initial convergence + + double tsync = 0.0; + double tsleep = 0.95 * tsum[situation].value[thr_id]; + if (cudaStreamQuery(stream) == cudaErrorNotReady) + { + usleep((useconds_t)(1e6*tsleep)); + struct timeval tv_start, tv_end; + gettimeofday(&tv_start, NULL); + result = cudaStreamSynchronize(stream); + gettimeofday(&tv_end, NULL); + tsync = 1e-6 * (tv_end.tv_usec-tv_start.tv_usec) + (tv_end.tv_sec-tv_start.tv_sec); + } + if (tsync >= 0) tsum[situation].value[thr_id] = a * tsum[situation].value[thr_id] + b * (tsleep+tsync); + } + else + result = cudaStreamSynchronize(stream); + return result; +} + +int scanhash_heavy_cpp(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, + unsigned long *hashes_done, uint32_t maxvote); + +extern "C" +int scanhash_heavy(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, + unsigned long *hashes_done, uint32_t maxvote) +{ + return scanhash_heavy_cpp(thr_id, pdata, + ptarget, max_nonce, hashes_done, maxvote); +} + +int scanhash_heavy_cpp(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, + unsigned long *hashes_done, uint32_t maxvote) +{ + // CUDA will process thousands of threads. + const int throughput = 4096 * 128; + + int rc = 0; + uint32_t *hash = NULL; + cudaMallocHost(&hash, throughput*8*sizeof(uint32_t)); + uint32_t *cpu_nonceVector = NULL; + cudaMallocHost(&cpu_nonceVector, throughput*sizeof(uint32_t)); + + int nrmCalls[6]; + memset(nrmCalls, 0, sizeof(int) * 6); + + uint32_t start_nonce = pdata[19]; + uint16_t *ext = (uint16_t *)&pdata[20]; + + // Create an individual target for each hash, based + // on the highest bit that is set in ptarget. + int highbit = findhighbit(ptarget, 8); + uint32_t target2[2], target3[2], target4[2], target5[2]; + genmask(target2, 2, highbit/4+(((highbit%4)>3)?1:0) ); // SHA256 + genmask(target3, 2, highbit/4+(((highbit%4)>2)?1:0) ); // keccak512 + genmask(target4, 2, highbit/4+(((highbit%4)>1)?1:0) ); // groestl512 + genmask(target5, 2, highbit/4+(((highbit%4)>0)?1:0) ); // blake512 + + static bool init[8] = {0,0,0,0,0,0,0,0}; + if (!init[thr_id]) + { + hefty_cpu_init(thr_id, throughput); + sha256_cpu_init(thr_id, throughput); + keccak512_cpu_init(thr_id, throughput); + groestl512_cpu_init(thr_id, throughput); + blake512_cpu_init(thr_id, throughput); + combine_cpu_init(thr_id, throughput); + init[thr_id] = true; + cudaMalloc(&d_nonceVector[thr_id], sizeof(uint32_t) * throughput); + } + + + if (opt_vote > maxvote) { + printf("Warning: Your block reward vote (%hu) exceeds " + "the maxvote reported by the pool (%hu).\n", + opt_vote, maxvote); + } + + if (opt_trust_pool && opt_vote > maxvote) { + printf("Warning: Capping block reward vote to maxvote reported by pool.\n"); + ext[0] = maxvote; + } + else + ext[0] = opt_vote; + + // Set the block data + hefty_cpu_setBlock(thr_id, throughput, pdata); + sha256_cpu_setBlock(pdata); + keccak512_cpu_setBlock(pdata); + groestl512_cpu_setBlock(pdata); + blake512_cpu_setBlock(pdata); + + do { + int i; + + ////// Compaction init + thrust::device_ptr devNoncePtr(d_nonceVector[thr_id]); + thrust::device_ptr devNoncePtrEnd((d_nonceVector[thr_id]) + throughput); + uint32_t actualNumberOfValuesInNonceVectorGPU = throughput; + + hefty_cpu_hash(thr_id, throughput, pdata[19]); + //cudaThreadSynchronize(); + sha256_cpu_hash(thr_id, throughput, pdata[19]); + //cudaThreadSynchronize(); + + // This is the longest CPU wait phase, hence a strategic MyStreamSynchronize() here.
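/*
 * Descriptive note on the call below: MyStreamSynchronize() (defined above)
 * keeps a per-situation, per-thread exponential moving average of how long the
 * synchronization takes, sleeps the CPU with usleep() for roughly 95% of that
 * estimate, and only then blocks in cudaStreamSynchronize(); this trades a
 * little latency for much lower CPU load while the GPU is busy.
 */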
+ MyStreamSynchronize(NULL, 0, thr_id); + + ////// Compaction + devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*((uint64_t*)target2), d_hash2output[thr_id], 8, pdata[19])); + actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); + if(actualNumberOfValuesInNonceVectorGPU == 0) + goto emptyNonceVector; + + keccak512_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19]); + //cudaThreadSynchronize(); + + ////// Compaction + devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*((uint64_t*)target3), d_hash3output[thr_id], 16, pdata[19])); + actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); + if(actualNumberOfValuesInNonceVectorGPU == 0) + goto emptyNonceVector; + + blake512_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19]); + //cudaThreadSynchronize(); + + ////// Compaction + devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*((uint64_t*)target5), d_hash5output[thr_id], 16, pdata[19])); + actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); + if(actualNumberOfValuesInNonceVectorGPU == 0) + goto emptyNonceVector; + + groestl512_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19]); + //cudaThreadSynchronize(); + + ////// Compaction + devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*((uint64_t*)target4), d_hash4output[thr_id], 16, pdata[19])); + actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); + if(actualNumberOfValuesInNonceVectorGPU == 0) + goto emptyNonceVector; + + // combine + combine_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19], hash); + + // copy the results + if(actualNumberOfValuesInNonceVectorGPU > 0) + { + cudaMemcpy(cpu_nonceVector, d_nonceVector[thr_id], sizeof(uint32_t) * actualNumberOfValuesInNonceVectorGPU, cudaMemcpyDeviceToHost); + + for (i=0; i + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * The views and conclusions contained in the software and documentation are those + * of the authors and should not be interpreted as representing official policies, + * either expressed or implied, of the FreeBSD Project. + */
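/*
 * Minimal usage sketch for the HEFTY1 API implemented below (the function name
 * is hypothetical and not part of this file; it assumes hefty1.h, included just
 * below, is visible). It shows both the one-shot HEFTY1() call and the
 * streaming Init/Update/Final form.
 */
static void hefty1_usage_sketch(void)
{
    HEFTY1_CTX ctx;
    unsigned char digest[HEFTY1_DIGEST_BYTES];
    static const unsigned char msg[] = "abc";

    HEFTY1(msg, sizeof(msg) - 1, digest);       /* one-shot hash of "abc" */

    HEFTY1_Init(&ctx);                          /* streaming form */
    HEFTY1_Update(&ctx, msg, sizeof(msg) - 1);
    HEFTY1_Final(digest, &ctx);
}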
+ +#include +#include + +#include "hefty1.h" + +#ifdef WIN32 +#define inline __inline +#endif + +#define Min(A, B) (A <= B ? A : B) + +#define RoundFunc(ctx, A, B, C, D, E, F, G, H, W, K) \ + { \ + /* To thwart parallelism, Br modifies itself each time it's \ + * called. This also means that calling it in different \ + * orders yields different results. In C the order of \ + * evaluation of function arguments and + operands is \ + * unspecified (and depends on the compiler), so we must make \ + * the order of Br calls explicit. \ + */ \ + uint32_t brG = Br(ctx, G); \ + uint32_t tmp1 = Ch(E, Br(ctx, F), brG) + H + W + K; \ + uint32_t tmp2 = tmp1 + Sigma1(Br(ctx, E)); \ + uint32_t brC = Br(ctx, C); \ + uint32_t brB = Br(ctx, B); \ + uint32_t tmp3 = Ma(Br(ctx, A), brB, brC); \ + uint32_t tmp4 = tmp3 + Sigma0(Br(ctx, A)); \ + H = G; \ + G = F; \ + F = E; \ + E = D + Br(ctx, tmp2); \ + D = C; \ + C = B; \ + B = A; \ + A = tmp2 + tmp4; \ + } \ + +/* Nothing up my sleeve constants */ +const static uint32_t K[64] = { + 0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL, + 0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL, + 0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL, + 0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, 0xc19bf174UL, + 0xe49b69c1UL, 0xefbe4786UL, 0x0fc19dc6UL, 0x240ca1ccUL, + 0x2de92c6fUL, 0x4a7484aaUL, 0x5cb0a9dcUL, 0x76f988daUL, + 0x983e5152UL, 0xa831c66dUL, 0xb00327c8UL, 0xbf597fc7UL, + 0xc6e00bf3UL, 0xd5a79147UL, 0x06ca6351UL, 0x14292967UL, + 0x27b70a85UL, 0x2e1b2138UL, 0x4d2c6dfcUL, 0x53380d13UL, + 0x650a7354UL, 0x766a0abbUL, 0x81c2c92eUL, 0x92722c85UL, + 0xa2bfe8a1UL, 0xa81a664bUL, 0xc24b8b70UL, 0xc76c51a3UL, + 0xd192e819UL, 0xd6990624UL, 0xf40e3585UL, 0x106aa070UL, + 0x19a4c116UL, 0x1e376c08UL, 0x2748774cUL, 0x34b0bcb5UL, + 0x391c0cb3UL, 0x4ed8aa4aUL, 0x5b9cca4fUL, 0x682e6ff3UL, + 0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL, + 0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL +}; + +/* Initial hash values */ +const static uint32_t H[HEFTY1_STATE_WORDS] = { + 0x6a09e667UL, + 0xbb67ae85UL, + 0x3c6ef372UL, + 0xa54ff53aUL, + 0x510e527fUL, + 0x9b05688cUL, + 0x1f83d9abUL, + 0x5be0cd19UL +}; + +static inline uint32_t Rr(uint32_t X, uint8_t n) +{ + return (X >> n) | (X << (32 - n)); +} + +static inline uint32_t Ch(uint32_t E, uint32_t F, uint32_t G) +{ + return (E & F) ^ (~E & G); +} + +static inline uint32_t Sigma1(uint32_t E) +{ + return Rr(E, 6) ^ Rr(E, 11) ^ Rr(E, 25); +} + +static inline uint32_t sigma1(uint32_t X) +{ + return Rr(X, 17) ^ Rr(X, 19) ^ (X >> 10); +} + +static inline uint32_t Ma(uint32_t A, uint32_t B, uint32_t C) +{ + return (A & B) ^ (A & C) ^ (B & C); +} + +static inline uint32_t Sigma0(uint32_t A) +{ + return Rr(A, 2) ^ Rr(A, 13) ^ Rr(A, 22); +} + +static inline uint32_t sigma0(uint32_t X) +{ + return Rr(X, 7) ^ Rr(X, 18) ^ (X >> 3); +} + +static inline uint32_t Reverse32(uint32_t n) +{ + #if BYTE_ORDER == LITTLE_ENDIAN + return n << 24 | (n & 0x0000ff00) << 8 | (n & 0x00ff0000) >> 8 | n >> 24; + #else + return n; + #endif +} + +static inline uint64_t Reverse64(uint64_t n) +{ + #if BYTE_ORDER == LITTLE_ENDIAN + uint32_t a = n >> 32; + uint32_t b = (n << 32) >> 32; + + return (uint64_t)Reverse32(b) << 32 | Reverse32(a); + #else + return n; + #endif +} + +/* Smoosh byte into nibble */ +static inline
uint8_t Smoosh4(uint8_t X) +{ + return (X >> 4) ^ (X & 0xf); +} + +/* Smoosh 32-bit word into 2-bits */ +static inline uint8_t Smoosh2(uint32_t X) +{ + uint16_t w = (X >> 16) ^ (X & 0xffff); + uint8_t n = Smoosh4((w >> 8) ^ (w & 0xff)); + return (n >> 2) ^ (n & 0x3); +} +#include +static void Mangle(uint32_t *S) +{ + uint8_t r0 = Smoosh4(S[0] >> 24); + uint8_t r1 = Smoosh4(S[0] >> 16); + uint8_t r2 = Smoosh4(S[0] >> 8); + uint8_t r3 = Smoosh4(S[0] & 0xff); + + /* Diffuse */ + S[1] ^= Rr(S[0], r0); + switch (Smoosh2(S[1])) { + case 0: S[2] ^= Rr(S[0], 1 + r0); break; + case 1: S[2] += Rr(~S[0], 1 + r1); break; + case 2: S[2] &= Rr(~S[0], 1 + r2); break; + case 3: S[2] ^= Rr(S[0], 1 + r3); break; + } + switch (Smoosh2(S[1] ^ S[2])) { + case 0: S[3] ^= Rr(S[0], 2 + r0); break; + case 1: S[3] += Rr(~S[0], 2 + r1); break; + case 2: S[3] &= Rr(~S[0], 2 + r2); break; + case 3: S[3] ^= Rr(S[0], 2 + r3); break; + } + + /* Compress */ + S[0] ^= (S[1] ^ S[2]) + S[3]; +} + +static void Absorb(uint32_t *S, uint32_t X) +{ + uint32_t *R = S; + R[0] ^= X; + Mangle(S); +} + +static uint32_t Squeeze(uint32_t *S) +{ + uint32_t Y = S[0]; + Mangle(S); + return Y; +} + +/* Branch, compress and serialize function */ +static inline uint32_t Br(HEFTY1_CTX *ctx, uint32_t X) +{ + uint32_t R = Squeeze(ctx->sponge); + + uint8_t r0 = R >> 8; + uint8_t r1 = R & 0xff; + + uint32_t Y = 1 << (r0 % 32); + + switch (r1 % 4) + { + case 0: + /* Do nothing */ + break; + case 1: + return X & ~Y; + case 2: + return X | Y; + case 3: + return X ^ Y; + } + + return X; +} + +static void HashBlock(HEFTY1_CTX *ctx) +{ + uint32_t A, B, C, D, E, F, G, H; + uint32_t W[HEFTY1_BLOCK_BYTES]; + int t; + + assert(ctx); + + A = ctx->h[0]; + B = ctx->h[1]; + C = ctx->h[2]; + D = ctx->h[3]; + E = ctx->h[4]; + F = ctx->h[5]; + G = ctx->h[6]; + H = ctx->h[7]; + + t = 0; + for (; t < 16; t++) { + W[t] = Reverse32(((uint32_t *)&ctx->block[0])[t]); /* To host byte order */ + Absorb(ctx->sponge, W[t] ^ K[t]); + } + + for (t = 0; t < 16; t++) { + Absorb(ctx->sponge, D ^ H); + RoundFunc(ctx, A, B, C, D, E, F, G, H, W[t], K[t]); + } + for (t = 16; t < 64; t++) { + Absorb(ctx->sponge, H + D); + W[t] = sigma1(W[t - 2]) + W[t - 7] + sigma0(W[t - 15]) + W[t - 16]; + RoundFunc(ctx, A, B, C, D, E, F, G, H, W[t], K[t]); + } + + ctx->h[0] += A; + ctx->h[1] += B; + ctx->h[2] += C; + ctx->h[3] += D; + ctx->h[4] += E; + ctx->h[5] += F; + ctx->h[6] += G; + ctx->h[7] += H; + + A = 0; + B = 0; + C = 0; + D = 0; + E = 0; + F = 0; + G = 0; + H = 0; + + memset(W, 0, sizeof(W)); +} + +/* Public interface */ + +void HEFTY1_Init(HEFTY1_CTX *ctx) +{ + assert(ctx); + + memcpy(ctx->h, H, sizeof(ctx->h)); + memset(ctx->block, 0, sizeof(ctx->block)); + ctx->written = 0; + memset(ctx->sponge, 0, sizeof(ctx->sponge)); +} + +void HEFTY1_Update(HEFTY1_CTX *ctx, const void *buf, size_t len) +{ + uint64_t read; + assert(ctx); + + read = 0; + while (len) { + uint64_t end = ctx->written % HEFTY1_BLOCK_BYTES; + uint64_t count = Min(len, HEFTY1_BLOCK_BYTES - end); + memcpy(&ctx->block[end], &((unsigned char *)buf)[read], (size_t)count); + len -= (size_t)count; + read += count; + ctx->written += count; + if (!(ctx->written % HEFTY1_BLOCK_BYTES)) + HashBlock(ctx); + } +} + +void HEFTY1_Final(unsigned char *digest, HEFTY1_CTX *ctx) +{ + uint64_t used; + uint64_t *len; + int i; + assert(digest); + assert(ctx); + + /* Pad message (FIPS 180 Section 5.1.1) */ + used = ctx->written % HEFTY1_BLOCK_BYTES; + ctx->block[used++] = 0x80; /* Append 1 to end of message */ + if (used > HEFTY1_BLOCK_BYTES - 
8) { + /* We have already written into the last 64bits, so + * we must continue into the next block. */ + memset(&ctx->block[used], 0, HEFTY1_BLOCK_BYTES - (size_t)used); + HashBlock(ctx); + used = 0; /* Create a new block (below) */ + } + + /* All remaining bits to zero */ + memset(&ctx->block[used], 0, HEFTY1_BLOCK_BYTES - 8 - (size_t)used); + + /* The last 64bits encode the length (in network byte order) */ + len = (uint64_t *)&ctx->block[HEFTY1_BLOCK_BYTES - 8]; + *len = Reverse64(ctx->written*8); + + HashBlock(ctx); + + /* Convert back to network byte order */ + i = 0; + for (; i < HEFTY1_STATE_WORDS; i++) + ctx->h[i] = Reverse32(ctx->h[i]); + + memcpy(digest, ctx->h, sizeof(ctx->h)); + memset(ctx, 0, sizeof(HEFTY1_CTX)); +} + +unsigned char* HEFTY1(const unsigned char *buf, size_t len, unsigned char *digest) +{ + HEFTY1_CTX ctx; + static unsigned char m[HEFTY1_DIGEST_BYTES]; + + if (!digest) + digest = m; + + HEFTY1_Init(&ctx); + HEFTY1_Update(&ctx, buf, len); + HEFTY1_Final(digest, &ctx); + + return digest; +} diff --git a/hefty1.h b/hefty1.h new file mode 100644 index 0000000..29939e8 --- /dev/null +++ b/hefty1.h @@ -0,0 +1,66 @@ +/* + * HEFTY1 CPU-only cryptographic hash function + * + * Copyright (c) 2014, dbcc14 + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * The views and conclusions contained in the software and documentation are those + * of the authors and should not be interpreted as representing official policies, + * either expressed or implied, of the FreeBSD Project. 
+ */ + +#ifndef __HEFTY1_H__ +#define __HEFTY1_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef WIN32 +#include +#endif + +#include + +#define HEFTY1_DIGEST_BYTES 32 +#define HEFTY1_BLOCK_BYTES 64 +#define HEFTY1_STATE_WORDS 8 +#define HEFTY1_SPONGE_WORDS 4 + +typedef struct HEFTY1_CTX { + uint32_t h[HEFTY1_STATE_WORDS]; + uint8_t block[HEFTY1_BLOCK_BYTES]; + uint64_t written; + uint32_t sponge[HEFTY1_SPONGE_WORDS]; +} HEFTY1_CTX; + +void HEFTY1_Init(HEFTY1_CTX *cxt); +void HEFTY1_Update(HEFTY1_CTX *cxt, const void *data, size_t len); +void HEFTY1_Final(unsigned char *digest, HEFTY1_CTX *cxt); +unsigned char* HEFTY1(const unsigned char *data, size_t len, unsigned char *digest); + +#ifdef __cplusplus +} +#endif + +#endif /* __HEFTY1_H__ */ diff --git a/install-sh b/install-sh new file mode 100644 index 0000000..9c04de2 --- /dev/null +++ b/install-sh @@ -0,0 +1,527 @@ +#!/bin/sh +# install - install a program, script, or datafile + +scriptversion=2011-01-19.21; # UTC + +# This originates from X11R5 (mit/util/scripts/install.sh), which was +# later released in X11R6 (xc/config/util/install.sh) with the +# following copyright and license. +# +# Copyright (C) 1994 X Consortium +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# X CONSORTIUM BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN +# AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNEC- +# TION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# Except as contained in this notice, the name of the X Consortium shall not +# be used in advertising or otherwise to promote the sale, use or other deal- +# ings in this Software without prior written authorization from the X Consor- +# tium. +# +# +# FSF changes to this file are in the public domain. +# +# Calling this script install-sh is preferred over install.sh, to prevent +# `make' implicit rules from creating a file called install from it +# when there is no Makefile. +# +# This script is compatible with the BSD install script, but was written +# from scratch. + +nl=' +' +IFS=" "" $nl" + +# set DOITPROG to echo to test this script + +# Don't use :- since 4.3BSD and earlier shells don't like it. +doit=${DOITPROG-} +if test -z "$doit"; then + doit_exec=exec +else + doit_exec=$doit +fi + +# Put in absolute file names if you don't have them in your path; +# or use environment vars. + +chgrpprog=${CHGRPPROG-chgrp} +chmodprog=${CHMODPROG-chmod} +chownprog=${CHOWNPROG-chown} +cmpprog=${CMPPROG-cmp} +cpprog=${CPPROG-cp} +mkdirprog=${MKDIRPROG-mkdir} +mvprog=${MVPROG-mv} +rmprog=${RMPROG-rm} +stripprog=${STRIPPROG-strip} + +posix_glob='?' +initialize_posix_glob=' + test "$posix_glob" != "?" 
|| { + if (set -f) 2>/dev/null; then + posix_glob= + else + posix_glob=: + fi + } +' + +posix_mkdir= + +# Desired mode of installed file. +mode=0755 + +chgrpcmd= +chmodcmd=$chmodprog +chowncmd= +mvcmd=$mvprog +rmcmd="$rmprog -f" +stripcmd= + +src= +dst= +dir_arg= +dst_arg= + +copy_on_change=false +no_target_directory= + +usage="\ +Usage: $0 [OPTION]... [-T] SRCFILE DSTFILE + or: $0 [OPTION]... SRCFILES... DIRECTORY + or: $0 [OPTION]... -t DIRECTORY SRCFILES... + or: $0 [OPTION]... -d DIRECTORIES... + +In the 1st form, copy SRCFILE to DSTFILE. +In the 2nd and 3rd, copy all SRCFILES to DIRECTORY. +In the 4th, create DIRECTORIES. + +Options: + --help display this help and exit. + --version display version info and exit. + + -c (ignored) + -C install only if different (preserve the last data modification time) + -d create directories instead of installing files. + -g GROUP $chgrpprog installed files to GROUP. + -m MODE $chmodprog installed files to MODE. + -o USER $chownprog installed files to USER. + -s $stripprog installed files. + -t DIRECTORY install into DIRECTORY. + -T report an error if DSTFILE is a directory. + +Environment variables override the default commands: + CHGRPPROG CHMODPROG CHOWNPROG CMPPROG CPPROG MKDIRPROG MVPROG + RMPROG STRIPPROG +" + +while test $# -ne 0; do + case $1 in + -c) ;; + + -C) copy_on_change=true;; + + -d) dir_arg=true;; + + -g) chgrpcmd="$chgrpprog $2" + shift;; + + --help) echo "$usage"; exit $?;; + + -m) mode=$2 + case $mode in + *' '* | *' '* | *' +'* | *'*'* | *'?'* | *'['*) + echo "$0: invalid mode: $mode" >&2 + exit 1;; + esac + shift;; + + -o) chowncmd="$chownprog $2" + shift;; + + -s) stripcmd=$stripprog;; + + -t) dst_arg=$2 + # Protect names problematic for `test' and other utilities. + case $dst_arg in + -* | [=\(\)!]) dst_arg=./$dst_arg;; + esac + shift;; + + -T) no_target_directory=true;; + + --version) echo "$0 $scriptversion"; exit $?;; + + --) shift + break;; + + -*) echo "$0: invalid option: $1" >&2 + exit 1;; + + *) break;; + esac + shift +done + +if test $# -ne 0 && test -z "$dir_arg$dst_arg"; then + # When -d is used, all remaining arguments are directories to create. + # When -t is used, the destination is already specified. + # Otherwise, the last argument is the destination. Remove it from $@. + for arg + do + if test -n "$dst_arg"; then + # $@ is not empty: it contains at least $arg. + set fnord "$@" "$dst_arg" + shift # fnord + fi + shift # arg + dst_arg=$arg + # Protect names problematic for `test' and other utilities. + case $dst_arg in + -* | [=\(\)!]) dst_arg=./$dst_arg;; + esac + done +fi + +if test $# -eq 0; then + if test -z "$dir_arg"; then + echo "$0: no input file specified." >&2 + exit 1 + fi + # It's OK to call `install-sh -d' without argument. + # This can happen when creating conditional directories. + exit 0 +fi + +if test -z "$dir_arg"; then + do_exit='(exit $ret); exit $ret' + trap "ret=129; $do_exit" 1 + trap "ret=130; $do_exit" 2 + trap "ret=141; $do_exit" 13 + trap "ret=143; $do_exit" 15 + + # Set umask so as not to create temps with too-generous modes. + # However, 'strip' requires both read and write access to temps. + case $mode in + # Optimize common cases. 
+ *644) cp_umask=133;; + *755) cp_umask=22;; + + *[0-7]) + if test -z "$stripcmd"; then + u_plus_rw= + else + u_plus_rw='% 200' + fi + cp_umask=`expr '(' 777 - $mode % 1000 ')' $u_plus_rw`;; + *) + if test -z "$stripcmd"; then + u_plus_rw= + else + u_plus_rw=,u+rw + fi + cp_umask=$mode$u_plus_rw;; + esac +fi + +for src +do + # Protect names problematic for `test' and other utilities. + case $src in + -* | [=\(\)!]) src=./$src;; + esac + + if test -n "$dir_arg"; then + dst=$src + dstdir=$dst + test -d "$dstdir" + dstdir_status=$? + else + + # Waiting for this to be detected by the "$cpprog $src $dsttmp" command + # might cause directories to be created, which would be especially bad + # if $src (and thus $dsttmp) contains '*'. + if test ! -f "$src" && test ! -d "$src"; then + echo "$0: $src does not exist." >&2 + exit 1 + fi + + if test -z "$dst_arg"; then + echo "$0: no destination specified." >&2 + exit 1 + fi + dst=$dst_arg + + # If destination is a directory, append the input filename; won't work + # if double slashes aren't ignored. + if test -d "$dst"; then + if test -n "$no_target_directory"; then + echo "$0: $dst_arg: Is a directory" >&2 + exit 1 + fi + dstdir=$dst + dst=$dstdir/`basename "$src"` + dstdir_status=0 + else + # Prefer dirname, but fall back on a substitute if dirname fails. + dstdir=` + (dirname "$dst") 2>/dev/null || + expr X"$dst" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ + X"$dst" : 'X\(//\)[^/]' \| \ + X"$dst" : 'X\(//\)$' \| \ + X"$dst" : 'X\(/\)' \| . 2>/dev/null || + echo X"$dst" | + sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ + s//\1/ + q + } + /^X\(\/\/\)[^/].*/{ + s//\1/ + q + } + /^X\(\/\/\)$/{ + s//\1/ + q + } + /^X\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q' + ` + + test -d "$dstdir" + dstdir_status=$? + fi + fi + + obsolete_mkdir_used=false + + if test $dstdir_status != 0; then + case $posix_mkdir in + '') + # Create intermediate dirs using mode 755 as modified by the umask. + # This is like FreeBSD 'install' as of 1997-10-28. + umask=`umask` + case $stripcmd.$umask in + # Optimize common cases. + *[2367][2367]) mkdir_umask=$umask;; + .*0[02][02] | .[02][02] | .[02]) mkdir_umask=22;; + + *[0-7]) + mkdir_umask=`expr $umask + 22 \ + - $umask % 100 % 40 + $umask % 20 \ + - $umask % 10 % 4 + $umask % 2 + `;; + *) mkdir_umask=$umask,go-w;; + esac + + # With -d, create the new directory with the user-specified mode. + # Otherwise, rely on $mkdir_umask. + if test -n "$dir_arg"; then + mkdir_mode=-m$mode + else + mkdir_mode= + fi + + posix_mkdir=false + case $umask in + *[123567][0-7][0-7]) + # POSIX mkdir -p sets u+wx bits regardless of umask, which + # is incompatible with FreeBSD 'install' when (umask & 300) != 0. + ;; + *) + tmpdir=${TMPDIR-/tmp}/ins$RANDOM-$$ + trap 'ret=$?; rmdir "$tmpdir/d" "$tmpdir" 2>/dev/null; exit $ret' 0 + + if (umask $mkdir_umask && + exec $mkdirprog $mkdir_mode -p -- "$tmpdir/d") >/dev/null 2>&1 + then + if test -z "$dir_arg" || { + # Check for POSIX incompatibilities with -m. + # HP-UX 11.23 and IRIX 6.5 mkdir -m -p sets group- or + # other-writeable bit of parent directory when it shouldn't. + # FreeBSD 6.1 mkdir -m -p sets mode of existing directory. 
+ ls_ld_tmpdir=`ls -ld "$tmpdir"` + case $ls_ld_tmpdir in + d????-?r-*) different_mode=700;; + d????-?--*) different_mode=755;; + *) false;; + esac && + $mkdirprog -m$different_mode -p -- "$tmpdir" && { + ls_ld_tmpdir_1=`ls -ld "$tmpdir"` + test "$ls_ld_tmpdir" = "$ls_ld_tmpdir_1" + } + } + then posix_mkdir=: + fi + rmdir "$tmpdir/d" "$tmpdir" + else + # Remove any dirs left behind by ancient mkdir implementations. + rmdir ./$mkdir_mode ./-p ./-- 2>/dev/null + fi + trap '' 0;; + esac;; + esac + + if + $posix_mkdir && ( + umask $mkdir_umask && + $doit_exec $mkdirprog $mkdir_mode -p -- "$dstdir" + ) + then : + else + + # The umask is ridiculous, or mkdir does not conform to POSIX, + # or it failed possibly due to a race condition. Create the + # directory the slow way, step by step, checking for races as we go. + + case $dstdir in + /*) prefix='/';; + [-=\(\)!]*) prefix='./';; + *) prefix='';; + esac + + eval "$initialize_posix_glob" + + oIFS=$IFS + IFS=/ + $posix_glob set -f + set fnord $dstdir + shift + $posix_glob set +f + IFS=$oIFS + + prefixes= + + for d + do + test X"$d" = X && continue + + prefix=$prefix$d + if test -d "$prefix"; then + prefixes= + else + if $posix_mkdir; then + (umask=$mkdir_umask && + $doit_exec $mkdirprog $mkdir_mode -p -- "$dstdir") && break + # Don't fail if two instances are running concurrently. + test -d "$prefix" || exit 1 + else + case $prefix in + *\'*) qprefix=`echo "$prefix" | sed "s/'/'\\\\\\\\''/g"`;; + *) qprefix=$prefix;; + esac + prefixes="$prefixes '$qprefix'" + fi + fi + prefix=$prefix/ + done + + if test -n "$prefixes"; then + # Don't fail if two instances are running concurrently. + (umask $mkdir_umask && + eval "\$doit_exec \$mkdirprog $prefixes") || + test -d "$dstdir" || exit 1 + obsolete_mkdir_used=true + fi + fi + fi + + if test -n "$dir_arg"; then + { test -z "$chowncmd" || $doit $chowncmd "$dst"; } && + { test -z "$chgrpcmd" || $doit $chgrpcmd "$dst"; } && + { test "$obsolete_mkdir_used$chowncmd$chgrpcmd" = false || + test -z "$chmodcmd" || $doit $chmodcmd $mode "$dst"; } || exit 1 + else + + # Make a couple of temp file names in the proper directory. + dsttmp=$dstdir/_inst.$$_ + rmtmp=$dstdir/_rm.$$_ + + # Trap to clean up those temp files at exit. + trap 'ret=$?; rm -f "$dsttmp" "$rmtmp" && exit $ret' 0 + + # Copy the file name to the temp name. + (umask $cp_umask && $doit_exec $cpprog "$src" "$dsttmp") && + + # and set any options; do chmod last to preserve setuid bits. + # + # If any of these fail, we abort the whole thing. If we want to + # ignore errors from any of these, just make sure not to ignore + # errors from the above "$doit $cpprog $src $dsttmp" command. + # + { test -z "$chowncmd" || $doit $chowncmd "$dsttmp"; } && + { test -z "$chgrpcmd" || $doit $chgrpcmd "$dsttmp"; } && + { test -z "$stripcmd" || $doit $stripcmd "$dsttmp"; } && + { test -z "$chmodcmd" || $doit $chmodcmd $mode "$dsttmp"; } && + + # If -C, don't bother to copy if it wouldn't change the file. + if $copy_on_change && + old=`LC_ALL=C ls -dlL "$dst" 2>/dev/null` && + new=`LC_ALL=C ls -dlL "$dsttmp" 2>/dev/null` && + + eval "$initialize_posix_glob" && + $posix_glob set -f && + set X $old && old=:$2:$4:$5:$6 && + set X $new && new=:$2:$4:$5:$6 && + $posix_glob set +f && + + test "$old" = "$new" && + $cmpprog "$dst" "$dsttmp" >/dev/null 2>&1 + then + rm -f "$dsttmp" + else + # Rename the file to the real destination. 
+ $doit $mvcmd -f "$dsttmp" "$dst" 2>/dev/null || + + # The rename failed, perhaps because mv can't rename something else + # to itself, or perhaps because mv is so ancient that it does not + # support -f. + { + # Now remove or move aside any old file at destination location. + # We try this two ways since rm can't unlink itself on some + # systems and the destination file might be busy for other + # reasons. In this case, the final cleanup might fail but the new + # file should still install successfully. + { + test ! -f "$dst" || + $doit $rmcmd -f "$dst" 2>/dev/null || + { $doit $mvcmd -f "$dst" "$rmtmp" 2>/dev/null && + { $doit $rmcmd -f "$rmtmp" 2>/dev/null; :; } + } || + { echo "$0: cannot unlink or rename $dst" >&2 + (exit 1); exit 1 + } + } && + + # Now rename the file to the real destination. + $doit $mvcmd "$dsttmp" "$dst" + } + fi || exit 1 + + trap '' 0 + fi +done + +# Local variables: +# eval: (add-hook 'write-file-hooks 'time-stamp) +# time-stamp-start: "scriptversion=" +# time-stamp-format: "%:y-%02m-%02d.%02H" +# time-stamp-time-zone: "UTC" +# time-stamp-end: "; # UTC" +# End: diff --git a/keccak.c b/keccak.c new file mode 100644 index 0000000..8c90f3a --- /dev/null +++ b/keccak.c @@ -0,0 +1,1824 @@ +/* $Id: keccak.c 259 2011-07-19 22:11:27Z tp $ */ +/* + * Keccak implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include +#include + +#include "sph_keccak.h" + +#ifdef __cplusplus +extern "C"{ +#endif + +/* + * Parameters: + * + * SPH_KECCAK_64 use a 64-bit type + * SPH_KECCAK_UNROLL number of loops to unroll (0/undef for full unroll) + * SPH_KECCAK_INTERLEAVE use bit-interleaving (32-bit type only) + * SPH_KECCAK_NOCOPY do not copy the state into local variables + * + * If there is no usable 64-bit type, the code automatically switches + * back to the 32-bit implementation. 
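 * For example, a constrained 32-bit build could explicitly pick the "small
 * machine" settings summed up below (an illustrative override, not something
 * this file defines by default):
 *
 *   #define SPH_KECCAK_64          0
 *   #define SPH_KECCAK_INTERLEAVE  1
 *   #define SPH_KECCAK_NOCOPY      0
 *   #define SPH_KECCAK_UNROLL      2
 *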
+ * + * Some tests on an Intel Core2 Q6600 (both 64-bit and 32-bit, 32 kB L1 + * code cache), a PowerPC (G3, 32 kB L1 code cache), an ARM920T core + * (16 kB L1 code cache), and a small MIPS-compatible CPU (Broadcom BCM3302, + * 8 kB L1 code cache), seem to show that the following are optimal: + * + * -- x86, 64-bit: use the 64-bit implementation, unroll 8 rounds, + * do not copy the state; unrolling 2, 6 or all rounds also provides + * near-optimal performance. + * -- x86, 32-bit: use the 32-bit implementation, unroll 6 rounds, + * interleave, do not copy the state. Unrolling 1, 2, 4 or 8 rounds + * also provides near-optimal performance. + * -- PowerPC: use the 64-bit implementation, unroll 8 rounds, + * copy the state. Unrolling 4 or 6 rounds is near-optimal. + * -- ARM: use the 64-bit implementation, unroll 2 or 4 rounds, + * copy the state. + * -- MIPS: use the 64-bit implementation, unroll 2 rounds, copy + * the state. Unrolling only 1 round is also near-optimal. + * + * Also, interleaving does not always yield actual improvements when + * using a 32-bit implementation; in particular when the architecture + * does not offer a native rotation opcode (interleaving replaces one + * 64-bit rotation with two 32-bit rotations, which is a gain only if + * there is a native 32-bit rotation opcode and not a native 64-bit + * rotation opcode; also, interleaving implies a small overhead when + * processing input words). + * + * To sum up: + * -- when possible, use the 64-bit code + * -- exception: on 32-bit x86, use 32-bit code + * -- when using 32-bit code, use interleaving + * -- copy the state, except on x86 + * -- unroll 8 rounds on "big" machine, 2 rounds on "small" machines + */ + +#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_KECCAK +#define SPH_SMALL_FOOTPRINT_KECCAK 1 +#endif + +/* + * By default, we select the 64-bit implementation if a 64-bit type + * is available, unless a 32-bit x86 is detected. + */ +#if !defined SPH_KECCAK_64 && SPH_64 \ + && !(defined __i386__ || SPH_I386_GCC || SPH_I386_MSVC) +#define SPH_KECCAK_64 1 +#endif + +/* + * If using a 32-bit implementation, we prefer to interleave. + */ +#if !SPH_KECCAK_64 && !defined SPH_KECCAK_INTERLEAVE +#define SPH_KECCAK_INTERLEAVE 1 +#endif + +/* + * Unroll 8 rounds on big systems, 2 rounds on small systems. + */ +#ifndef SPH_KECCAK_UNROLL +#if SPH_SMALL_FOOTPRINT_KECCAK +#define SPH_KECCAK_UNROLL 2 +#else +#define SPH_KECCAK_UNROLL 8 +#endif +#endif + +/* + * We do not want to copy the state to local variables on x86 (32-bit + * and 64-bit alike). 
+ */ +#ifndef SPH_KECCAK_NOCOPY +#if defined __i386__ || defined __x86_64 || SPH_I386_MSVC || SPH_I386_GCC +#define SPH_KECCAK_NOCOPY 1 +#else +#define SPH_KECCAK_NOCOPY 0 +#endif +#endif + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +#if SPH_KECCAK_64 + +static const sph_u64 RC[] = { + SPH_C64(0x0000000000000001), SPH_C64(0x0000000000008082), + SPH_C64(0x800000000000808A), SPH_C64(0x8000000080008000), + SPH_C64(0x000000000000808B), SPH_C64(0x0000000080000001), + SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008009), + SPH_C64(0x000000000000008A), SPH_C64(0x0000000000000088), + SPH_C64(0x0000000080008009), SPH_C64(0x000000008000000A), + SPH_C64(0x000000008000808B), SPH_C64(0x800000000000008B), + SPH_C64(0x8000000000008089), SPH_C64(0x8000000000008003), + SPH_C64(0x8000000000008002), SPH_C64(0x8000000000000080), + SPH_C64(0x000000000000800A), SPH_C64(0x800000008000000A), + SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008080), + SPH_C64(0x0000000080000001), SPH_C64(0x8000000080008008) +}; + +#if SPH_KECCAK_NOCOPY + +#define a00 (kc->u.wide[ 0]) +#define a10 (kc->u.wide[ 1]) +#define a20 (kc->u.wide[ 2]) +#define a30 (kc->u.wide[ 3]) +#define a40 (kc->u.wide[ 4]) +#define a01 (kc->u.wide[ 5]) +#define a11 (kc->u.wide[ 6]) +#define a21 (kc->u.wide[ 7]) +#define a31 (kc->u.wide[ 8]) +#define a41 (kc->u.wide[ 9]) +#define a02 (kc->u.wide[10]) +#define a12 (kc->u.wide[11]) +#define a22 (kc->u.wide[12]) +#define a32 (kc->u.wide[13]) +#define a42 (kc->u.wide[14]) +#define a03 (kc->u.wide[15]) +#define a13 (kc->u.wide[16]) +#define a23 (kc->u.wide[17]) +#define a33 (kc->u.wide[18]) +#define a43 (kc->u.wide[19]) +#define a04 (kc->u.wide[20]) +#define a14 (kc->u.wide[21]) +#define a24 (kc->u.wide[22]) +#define a34 (kc->u.wide[23]) +#define a44 (kc->u.wide[24]) + +#define DECL_STATE +#define READ_STATE(sc) +#define WRITE_STATE(sc) + +#define INPUT_BUF(size) do { \ + size_t j; \ + for (j = 0; j < (size); j += 8) { \ + kc->u.wide[j >> 3] ^= sph_dec64le_aligned(buf + j); \ + } \ + } while (0) + +#define INPUT_BUF144 INPUT_BUF(144) +#define INPUT_BUF136 INPUT_BUF(136) +#define INPUT_BUF104 INPUT_BUF(104) +#define INPUT_BUF72 INPUT_BUF(72) + +#else + +#define DECL_STATE \ + sph_u64 a00, a01, a02, a03, a04; \ + sph_u64 a10, a11, a12, a13, a14; \ + sph_u64 a20, a21, a22, a23, a24; \ + sph_u64 a30, a31, a32, a33, a34; \ + sph_u64 a40, a41, a42, a43, a44; + +#define READ_STATE(state) do { \ + a00 = (state)->u.wide[ 0]; \ + a10 = (state)->u.wide[ 1]; \ + a20 = (state)->u.wide[ 2]; \ + a30 = (state)->u.wide[ 3]; \ + a40 = (state)->u.wide[ 4]; \ + a01 = (state)->u.wide[ 5]; \ + a11 = (state)->u.wide[ 6]; \ + a21 = (state)->u.wide[ 7]; \ + a31 = (state)->u.wide[ 8]; \ + a41 = (state)->u.wide[ 9]; \ + a02 = (state)->u.wide[10]; \ + a12 = (state)->u.wide[11]; \ + a22 = (state)->u.wide[12]; \ + a32 = (state)->u.wide[13]; \ + a42 = (state)->u.wide[14]; \ + a03 = (state)->u.wide[15]; \ + a13 = (state)->u.wide[16]; \ + a23 = (state)->u.wide[17]; \ + a33 = (state)->u.wide[18]; \ + a43 = (state)->u.wide[19]; \ + a04 = (state)->u.wide[20]; \ + a14 = (state)->u.wide[21]; \ + a24 = (state)->u.wide[22]; \ + a34 = (state)->u.wide[23]; \ + a44 = (state)->u.wide[24]; \ + } while (0) + +#define WRITE_STATE(state) do { \ + (state)->u.wide[ 0] = a00; \ + (state)->u.wide[ 1] = a10; \ + (state)->u.wide[ 2] = a20; \ + (state)->u.wide[ 3] = a30; \ + (state)->u.wide[ 4] = a40; \ + (state)->u.wide[ 5] = a01; \ + (state)->u.wide[ 6] = a11; \ + (state)->u.wide[ 7] = a21; \ + (state)->u.wide[ 8] = a31; \ + 
(state)->u.wide[ 9] = a41; \ + (state)->u.wide[10] = a02; \ + (state)->u.wide[11] = a12; \ + (state)->u.wide[12] = a22; \ + (state)->u.wide[13] = a32; \ + (state)->u.wide[14] = a42; \ + (state)->u.wide[15] = a03; \ + (state)->u.wide[16] = a13; \ + (state)->u.wide[17] = a23; \ + (state)->u.wide[18] = a33; \ + (state)->u.wide[19] = a43; \ + (state)->u.wide[20] = a04; \ + (state)->u.wide[21] = a14; \ + (state)->u.wide[22] = a24; \ + (state)->u.wide[23] = a34; \ + (state)->u.wide[24] = a44; \ + } while (0) + +#define INPUT_BUF144 do { \ + a00 ^= sph_dec64le_aligned(buf + 0); \ + a10 ^= sph_dec64le_aligned(buf + 8); \ + a20 ^= sph_dec64le_aligned(buf + 16); \ + a30 ^= sph_dec64le_aligned(buf + 24); \ + a40 ^= sph_dec64le_aligned(buf + 32); \ + a01 ^= sph_dec64le_aligned(buf + 40); \ + a11 ^= sph_dec64le_aligned(buf + 48); \ + a21 ^= sph_dec64le_aligned(buf + 56); \ + a31 ^= sph_dec64le_aligned(buf + 64); \ + a41 ^= sph_dec64le_aligned(buf + 72); \ + a02 ^= sph_dec64le_aligned(buf + 80); \ + a12 ^= sph_dec64le_aligned(buf + 88); \ + a22 ^= sph_dec64le_aligned(buf + 96); \ + a32 ^= sph_dec64le_aligned(buf + 104); \ + a42 ^= sph_dec64le_aligned(buf + 112); \ + a03 ^= sph_dec64le_aligned(buf + 120); \ + a13 ^= sph_dec64le_aligned(buf + 128); \ + a23 ^= sph_dec64le_aligned(buf + 136); \ + } while (0) + +#define INPUT_BUF136 do { \ + a00 ^= sph_dec64le_aligned(buf + 0); \ + a10 ^= sph_dec64le_aligned(buf + 8); \ + a20 ^= sph_dec64le_aligned(buf + 16); \ + a30 ^= sph_dec64le_aligned(buf + 24); \ + a40 ^= sph_dec64le_aligned(buf + 32); \ + a01 ^= sph_dec64le_aligned(buf + 40); \ + a11 ^= sph_dec64le_aligned(buf + 48); \ + a21 ^= sph_dec64le_aligned(buf + 56); \ + a31 ^= sph_dec64le_aligned(buf + 64); \ + a41 ^= sph_dec64le_aligned(buf + 72); \ + a02 ^= sph_dec64le_aligned(buf + 80); \ + a12 ^= sph_dec64le_aligned(buf + 88); \ + a22 ^= sph_dec64le_aligned(buf + 96); \ + a32 ^= sph_dec64le_aligned(buf + 104); \ + a42 ^= sph_dec64le_aligned(buf + 112); \ + a03 ^= sph_dec64le_aligned(buf + 120); \ + a13 ^= sph_dec64le_aligned(buf + 128); \ + } while (0) + +#define INPUT_BUF104 do { \ + a00 ^= sph_dec64le_aligned(buf + 0); \ + a10 ^= sph_dec64le_aligned(buf + 8); \ + a20 ^= sph_dec64le_aligned(buf + 16); \ + a30 ^= sph_dec64le_aligned(buf + 24); \ + a40 ^= sph_dec64le_aligned(buf + 32); \ + a01 ^= sph_dec64le_aligned(buf + 40); \ + a11 ^= sph_dec64le_aligned(buf + 48); \ + a21 ^= sph_dec64le_aligned(buf + 56); \ + a31 ^= sph_dec64le_aligned(buf + 64); \ + a41 ^= sph_dec64le_aligned(buf + 72); \ + a02 ^= sph_dec64le_aligned(buf + 80); \ + a12 ^= sph_dec64le_aligned(buf + 88); \ + a22 ^= sph_dec64le_aligned(buf + 96); \ + } while (0) + +#define INPUT_BUF72 do { \ + a00 ^= sph_dec64le_aligned(buf + 0); \ + a10 ^= sph_dec64le_aligned(buf + 8); \ + a20 ^= sph_dec64le_aligned(buf + 16); \ + a30 ^= sph_dec64le_aligned(buf + 24); \ + a40 ^= sph_dec64le_aligned(buf + 32); \ + a01 ^= sph_dec64le_aligned(buf + 40); \ + a11 ^= sph_dec64le_aligned(buf + 48); \ + a21 ^= sph_dec64le_aligned(buf + 56); \ + a31 ^= sph_dec64le_aligned(buf + 64); \ + } while (0) + +#define INPUT_BUF(lim) do { \ + a00 ^= sph_dec64le_aligned(buf + 0); \ + a10 ^= sph_dec64le_aligned(buf + 8); \ + a20 ^= sph_dec64le_aligned(buf + 16); \ + a30 ^= sph_dec64le_aligned(buf + 24); \ + a40 ^= sph_dec64le_aligned(buf + 32); \ + a01 ^= sph_dec64le_aligned(buf + 40); \ + a11 ^= sph_dec64le_aligned(buf + 48); \ + a21 ^= sph_dec64le_aligned(buf + 56); \ + a31 ^= sph_dec64le_aligned(buf + 64); \ + if ((lim) == 72) \ + break; \ + a41 ^= 
sph_dec64le_aligned(buf + 72); \ + a02 ^= sph_dec64le_aligned(buf + 80); \ + a12 ^= sph_dec64le_aligned(buf + 88); \ + a22 ^= sph_dec64le_aligned(buf + 96); \ + if ((lim) == 104) \ + break; \ + a32 ^= sph_dec64le_aligned(buf + 104); \ + a42 ^= sph_dec64le_aligned(buf + 112); \ + a03 ^= sph_dec64le_aligned(buf + 120); \ + a13 ^= sph_dec64le_aligned(buf + 128); \ + if ((lim) == 136) \ + break; \ + a23 ^= sph_dec64le_aligned(buf + 136); \ + } while (0) + +#endif + +#define DECL64(x) sph_u64 x +#define MOV64(d, s) (d = s) +#define XOR64(d, a, b) (d = a ^ b) +#define AND64(d, a, b) (d = a & b) +#define OR64(d, a, b) (d = a | b) +#define NOT64(d, s) (d = SPH_T64(~s)) +#define ROL64(d, v, n) (d = SPH_ROTL64(v, n)) +#define XOR64_IOTA XOR64 + +#else + +static const struct { + sph_u32 high, low; +} RC[] = { +#if SPH_KECCAK_INTERLEAVE + { SPH_C32(0x00000000), SPH_C32(0x00000001) }, + { SPH_C32(0x00000089), SPH_C32(0x00000000) }, + { SPH_C32(0x8000008B), SPH_C32(0x00000000) }, + { SPH_C32(0x80008080), SPH_C32(0x00000000) }, + { SPH_C32(0x0000008B), SPH_C32(0x00000001) }, + { SPH_C32(0x00008000), SPH_C32(0x00000001) }, + { SPH_C32(0x80008088), SPH_C32(0x00000001) }, + { SPH_C32(0x80000082), SPH_C32(0x00000001) }, + { SPH_C32(0x0000000B), SPH_C32(0x00000000) }, + { SPH_C32(0x0000000A), SPH_C32(0x00000000) }, + { SPH_C32(0x00008082), SPH_C32(0x00000001) }, + { SPH_C32(0x00008003), SPH_C32(0x00000000) }, + { SPH_C32(0x0000808B), SPH_C32(0x00000001) }, + { SPH_C32(0x8000000B), SPH_C32(0x00000001) }, + { SPH_C32(0x8000008A), SPH_C32(0x00000001) }, + { SPH_C32(0x80000081), SPH_C32(0x00000001) }, + { SPH_C32(0x80000081), SPH_C32(0x00000000) }, + { SPH_C32(0x80000008), SPH_C32(0x00000000) }, + { SPH_C32(0x00000083), SPH_C32(0x00000000) }, + { SPH_C32(0x80008003), SPH_C32(0x00000000) }, + { SPH_C32(0x80008088), SPH_C32(0x00000001) }, + { SPH_C32(0x80000088), SPH_C32(0x00000000) }, + { SPH_C32(0x00008000), SPH_C32(0x00000001) }, + { SPH_C32(0x80008082), SPH_C32(0x00000000) } +#else + { SPH_C32(0x00000000), SPH_C32(0x00000001) }, + { SPH_C32(0x00000000), SPH_C32(0x00008082) }, + { SPH_C32(0x80000000), SPH_C32(0x0000808A) }, + { SPH_C32(0x80000000), SPH_C32(0x80008000) }, + { SPH_C32(0x00000000), SPH_C32(0x0000808B) }, + { SPH_C32(0x00000000), SPH_C32(0x80000001) }, + { SPH_C32(0x80000000), SPH_C32(0x80008081) }, + { SPH_C32(0x80000000), SPH_C32(0x00008009) }, + { SPH_C32(0x00000000), SPH_C32(0x0000008A) }, + { SPH_C32(0x00000000), SPH_C32(0x00000088) }, + { SPH_C32(0x00000000), SPH_C32(0x80008009) }, + { SPH_C32(0x00000000), SPH_C32(0x8000000A) }, + { SPH_C32(0x00000000), SPH_C32(0x8000808B) }, + { SPH_C32(0x80000000), SPH_C32(0x0000008B) }, + { SPH_C32(0x80000000), SPH_C32(0x00008089) }, + { SPH_C32(0x80000000), SPH_C32(0x00008003) }, + { SPH_C32(0x80000000), SPH_C32(0x00008002) }, + { SPH_C32(0x80000000), SPH_C32(0x00000080) }, + { SPH_C32(0x00000000), SPH_C32(0x0000800A) }, + { SPH_C32(0x80000000), SPH_C32(0x8000000A) }, + { SPH_C32(0x80000000), SPH_C32(0x80008081) }, + { SPH_C32(0x80000000), SPH_C32(0x00008080) }, + { SPH_C32(0x00000000), SPH_C32(0x80000001) }, + { SPH_C32(0x80000000), SPH_C32(0x80008008) } +#endif +}; + +#if SPH_KECCAK_INTERLEAVE + +#define INTERLEAVE(xl, xh) do { \ + sph_u32 l, h, t; \ + l = (xl); h = (xh); \ + t = (l ^ (l >> 1)) & SPH_C32(0x22222222); l ^= t ^ (t << 1); \ + t = (h ^ (h >> 1)) & SPH_C32(0x22222222); h ^= t ^ (t << 1); \ + t = (l ^ (l >> 2)) & SPH_C32(0x0C0C0C0C); l ^= t ^ (t << 2); \ + t = (h ^ (h >> 2)) & SPH_C32(0x0C0C0C0C); h ^= t ^ (t << 2); \ + t = (l ^ (l >> 4)) & 
SPH_C32(0x00F000F0); l ^= t ^ (t << 4); \ + t = (h ^ (h >> 4)) & SPH_C32(0x00F000F0); h ^= t ^ (t << 4); \ + t = (l ^ (l >> 8)) & SPH_C32(0x0000FF00); l ^= t ^ (t << 8); \ + t = (h ^ (h >> 8)) & SPH_C32(0x0000FF00); h ^= t ^ (t << 8); \ + t = (l ^ SPH_T32(h << 16)) & SPH_C32(0xFFFF0000); \ + l ^= t; h ^= t >> 16; \ + (xl) = l; (xh) = h; \ + } while (0) + +#define UNINTERLEAVE(xl, xh) do { \ + sph_u32 l, h, t; \ + l = (xl); h = (xh); \ + t = (l ^ SPH_T32(h << 16)) & SPH_C32(0xFFFF0000); \ + l ^= t; h ^= t >> 16; \ + t = (l ^ (l >> 8)) & SPH_C32(0x0000FF00); l ^= t ^ (t << 8); \ + t = (h ^ (h >> 8)) & SPH_C32(0x0000FF00); h ^= t ^ (t << 8); \ + t = (l ^ (l >> 4)) & SPH_C32(0x00F000F0); l ^= t ^ (t << 4); \ + t = (h ^ (h >> 4)) & SPH_C32(0x00F000F0); h ^= t ^ (t << 4); \ + t = (l ^ (l >> 2)) & SPH_C32(0x0C0C0C0C); l ^= t ^ (t << 2); \ + t = (h ^ (h >> 2)) & SPH_C32(0x0C0C0C0C); h ^= t ^ (t << 2); \ + t = (l ^ (l >> 1)) & SPH_C32(0x22222222); l ^= t ^ (t << 1); \ + t = (h ^ (h >> 1)) & SPH_C32(0x22222222); h ^= t ^ (t << 1); \ + (xl) = l; (xh) = h; \ + } while (0) + +#else + +#define INTERLEAVE(l, h) +#define UNINTERLEAVE(l, h) + +#endif + +#if SPH_KECCAK_NOCOPY + +#define a00l (kc->u.narrow[2 * 0 + 0]) +#define a00h (kc->u.narrow[2 * 0 + 1]) +#define a10l (kc->u.narrow[2 * 1 + 0]) +#define a10h (kc->u.narrow[2 * 1 + 1]) +#define a20l (kc->u.narrow[2 * 2 + 0]) +#define a20h (kc->u.narrow[2 * 2 + 1]) +#define a30l (kc->u.narrow[2 * 3 + 0]) +#define a30h (kc->u.narrow[2 * 3 + 1]) +#define a40l (kc->u.narrow[2 * 4 + 0]) +#define a40h (kc->u.narrow[2 * 4 + 1]) +#define a01l (kc->u.narrow[2 * 5 + 0]) +#define a01h (kc->u.narrow[2 * 5 + 1]) +#define a11l (kc->u.narrow[2 * 6 + 0]) +#define a11h (kc->u.narrow[2 * 6 + 1]) +#define a21l (kc->u.narrow[2 * 7 + 0]) +#define a21h (kc->u.narrow[2 * 7 + 1]) +#define a31l (kc->u.narrow[2 * 8 + 0]) +#define a31h (kc->u.narrow[2 * 8 + 1]) +#define a41l (kc->u.narrow[2 * 9 + 0]) +#define a41h (kc->u.narrow[2 * 9 + 1]) +#define a02l (kc->u.narrow[2 * 10 + 0]) +#define a02h (kc->u.narrow[2 * 10 + 1]) +#define a12l (kc->u.narrow[2 * 11 + 0]) +#define a12h (kc->u.narrow[2 * 11 + 1]) +#define a22l (kc->u.narrow[2 * 12 + 0]) +#define a22h (kc->u.narrow[2 * 12 + 1]) +#define a32l (kc->u.narrow[2 * 13 + 0]) +#define a32h (kc->u.narrow[2 * 13 + 1]) +#define a42l (kc->u.narrow[2 * 14 + 0]) +#define a42h (kc->u.narrow[2 * 14 + 1]) +#define a03l (kc->u.narrow[2 * 15 + 0]) +#define a03h (kc->u.narrow[2 * 15 + 1]) +#define a13l (kc->u.narrow[2 * 16 + 0]) +#define a13h (kc->u.narrow[2 * 16 + 1]) +#define a23l (kc->u.narrow[2 * 17 + 0]) +#define a23h (kc->u.narrow[2 * 17 + 1]) +#define a33l (kc->u.narrow[2 * 18 + 0]) +#define a33h (kc->u.narrow[2 * 18 + 1]) +#define a43l (kc->u.narrow[2 * 19 + 0]) +#define a43h (kc->u.narrow[2 * 19 + 1]) +#define a04l (kc->u.narrow[2 * 20 + 0]) +#define a04h (kc->u.narrow[2 * 20 + 1]) +#define a14l (kc->u.narrow[2 * 21 + 0]) +#define a14h (kc->u.narrow[2 * 21 + 1]) +#define a24l (kc->u.narrow[2 * 22 + 0]) +#define a24h (kc->u.narrow[2 * 22 + 1]) +#define a34l (kc->u.narrow[2 * 23 + 0]) +#define a34h (kc->u.narrow[2 * 23 + 1]) +#define a44l (kc->u.narrow[2 * 24 + 0]) +#define a44h (kc->u.narrow[2 * 24 + 1]) + +#define DECL_STATE +#define READ_STATE(state) +#define WRITE_STATE(state) + +#define INPUT_BUF(size) do { \ + size_t j; \ + for (j = 0; j < (size); j += 8) { \ + sph_u32 tl, th; \ + tl = sph_dec32le_aligned(buf + j + 0); \ + th = sph_dec32le_aligned(buf + j + 4); \ + INTERLEAVE(tl, th); \ + kc->u.narrow[(j >> 2) + 0] ^= tl; \ + 
kc->u.narrow[(j >> 2) + 1] ^= th; \ + } \ + } while (0) + +#define INPUT_BUF144 INPUT_BUF(144) +#define INPUT_BUF136 INPUT_BUF(136) +#define INPUT_BUF104 INPUT_BUF(104) +#define INPUT_BUF72 INPUT_BUF(72) + +#else + +#define DECL_STATE \ + sph_u32 a00l, a00h, a01l, a01h, a02l, a02h, a03l, a03h, a04l, a04h; \ + sph_u32 a10l, a10h, a11l, a11h, a12l, a12h, a13l, a13h, a14l, a14h; \ + sph_u32 a20l, a20h, a21l, a21h, a22l, a22h, a23l, a23h, a24l, a24h; \ + sph_u32 a30l, a30h, a31l, a31h, a32l, a32h, a33l, a33h, a34l, a34h; \ + sph_u32 a40l, a40h, a41l, a41h, a42l, a42h, a43l, a43h, a44l, a44h; + +#define READ_STATE(state) do { \ + a00l = (state)->u.narrow[2 * 0 + 0]; \ + a00h = (state)->u.narrow[2 * 0 + 1]; \ + a10l = (state)->u.narrow[2 * 1 + 0]; \ + a10h = (state)->u.narrow[2 * 1 + 1]; \ + a20l = (state)->u.narrow[2 * 2 + 0]; \ + a20h = (state)->u.narrow[2 * 2 + 1]; \ + a30l = (state)->u.narrow[2 * 3 + 0]; \ + a30h = (state)->u.narrow[2 * 3 + 1]; \ + a40l = (state)->u.narrow[2 * 4 + 0]; \ + a40h = (state)->u.narrow[2 * 4 + 1]; \ + a01l = (state)->u.narrow[2 * 5 + 0]; \ + a01h = (state)->u.narrow[2 * 5 + 1]; \ + a11l = (state)->u.narrow[2 * 6 + 0]; \ + a11h = (state)->u.narrow[2 * 6 + 1]; \ + a21l = (state)->u.narrow[2 * 7 + 0]; \ + a21h = (state)->u.narrow[2 * 7 + 1]; \ + a31l = (state)->u.narrow[2 * 8 + 0]; \ + a31h = (state)->u.narrow[2 * 8 + 1]; \ + a41l = (state)->u.narrow[2 * 9 + 0]; \ + a41h = (state)->u.narrow[2 * 9 + 1]; \ + a02l = (state)->u.narrow[2 * 10 + 0]; \ + a02h = (state)->u.narrow[2 * 10 + 1]; \ + a12l = (state)->u.narrow[2 * 11 + 0]; \ + a12h = (state)->u.narrow[2 * 11 + 1]; \ + a22l = (state)->u.narrow[2 * 12 + 0]; \ + a22h = (state)->u.narrow[2 * 12 + 1]; \ + a32l = (state)->u.narrow[2 * 13 + 0]; \ + a32h = (state)->u.narrow[2 * 13 + 1]; \ + a42l = (state)->u.narrow[2 * 14 + 0]; \ + a42h = (state)->u.narrow[2 * 14 + 1]; \ + a03l = (state)->u.narrow[2 * 15 + 0]; \ + a03h = (state)->u.narrow[2 * 15 + 1]; \ + a13l = (state)->u.narrow[2 * 16 + 0]; \ + a13h = (state)->u.narrow[2 * 16 + 1]; \ + a23l = (state)->u.narrow[2 * 17 + 0]; \ + a23h = (state)->u.narrow[2 * 17 + 1]; \ + a33l = (state)->u.narrow[2 * 18 + 0]; \ + a33h = (state)->u.narrow[2 * 18 + 1]; \ + a43l = (state)->u.narrow[2 * 19 + 0]; \ + a43h = (state)->u.narrow[2 * 19 + 1]; \ + a04l = (state)->u.narrow[2 * 20 + 0]; \ + a04h = (state)->u.narrow[2 * 20 + 1]; \ + a14l = (state)->u.narrow[2 * 21 + 0]; \ + a14h = (state)->u.narrow[2 * 21 + 1]; \ + a24l = (state)->u.narrow[2 * 22 + 0]; \ + a24h = (state)->u.narrow[2 * 22 + 1]; \ + a34l = (state)->u.narrow[2 * 23 + 0]; \ + a34h = (state)->u.narrow[2 * 23 + 1]; \ + a44l = (state)->u.narrow[2 * 24 + 0]; \ + a44h = (state)->u.narrow[2 * 24 + 1]; \ + } while (0) + +#define WRITE_STATE(state) do { \ + (state)->u.narrow[2 * 0 + 0] = a00l; \ + (state)->u.narrow[2 * 0 + 1] = a00h; \ + (state)->u.narrow[2 * 1 + 0] = a10l; \ + (state)->u.narrow[2 * 1 + 1] = a10h; \ + (state)->u.narrow[2 * 2 + 0] = a20l; \ + (state)->u.narrow[2 * 2 + 1] = a20h; \ + (state)->u.narrow[2 * 3 + 0] = a30l; \ + (state)->u.narrow[2 * 3 + 1] = a30h; \ + (state)->u.narrow[2 * 4 + 0] = a40l; \ + (state)->u.narrow[2 * 4 + 1] = a40h; \ + (state)->u.narrow[2 * 5 + 0] = a01l; \ + (state)->u.narrow[2 * 5 + 1] = a01h; \ + (state)->u.narrow[2 * 6 + 0] = a11l; \ + (state)->u.narrow[2 * 6 + 1] = a11h; \ + (state)->u.narrow[2 * 7 + 0] = a21l; \ + (state)->u.narrow[2 * 7 + 1] = a21h; \ + (state)->u.narrow[2 * 8 + 0] = a31l; \ + (state)->u.narrow[2 * 8 + 1] = a31h; \ + (state)->u.narrow[2 * 9 + 0] = a41l; \ + 
(state)->u.narrow[2 * 9 + 1] = a41h; \ + (state)->u.narrow[2 * 10 + 0] = a02l; \ + (state)->u.narrow[2 * 10 + 1] = a02h; \ + (state)->u.narrow[2 * 11 + 0] = a12l; \ + (state)->u.narrow[2 * 11 + 1] = a12h; \ + (state)->u.narrow[2 * 12 + 0] = a22l; \ + (state)->u.narrow[2 * 12 + 1] = a22h; \ + (state)->u.narrow[2 * 13 + 0] = a32l; \ + (state)->u.narrow[2 * 13 + 1] = a32h; \ + (state)->u.narrow[2 * 14 + 0] = a42l; \ + (state)->u.narrow[2 * 14 + 1] = a42h; \ + (state)->u.narrow[2 * 15 + 0] = a03l; \ + (state)->u.narrow[2 * 15 + 1] = a03h; \ + (state)->u.narrow[2 * 16 + 0] = a13l; \ + (state)->u.narrow[2 * 16 + 1] = a13h; \ + (state)->u.narrow[2 * 17 + 0] = a23l; \ + (state)->u.narrow[2 * 17 + 1] = a23h; \ + (state)->u.narrow[2 * 18 + 0] = a33l; \ + (state)->u.narrow[2 * 18 + 1] = a33h; \ + (state)->u.narrow[2 * 19 + 0] = a43l; \ + (state)->u.narrow[2 * 19 + 1] = a43h; \ + (state)->u.narrow[2 * 20 + 0] = a04l; \ + (state)->u.narrow[2 * 20 + 1] = a04h; \ + (state)->u.narrow[2 * 21 + 0] = a14l; \ + (state)->u.narrow[2 * 21 + 1] = a14h; \ + (state)->u.narrow[2 * 22 + 0] = a24l; \ + (state)->u.narrow[2 * 22 + 1] = a24h; \ + (state)->u.narrow[2 * 23 + 0] = a34l; \ + (state)->u.narrow[2 * 23 + 1] = a34h; \ + (state)->u.narrow[2 * 24 + 0] = a44l; \ + (state)->u.narrow[2 * 24 + 1] = a44h; \ + } while (0) + +#define READ64(d, off) do { \ + sph_u32 tl, th; \ + tl = sph_dec32le_aligned(buf + (off)); \ + th = sph_dec32le_aligned(buf + (off) + 4); \ + INTERLEAVE(tl, th); \ + d ## l ^= tl; \ + d ## h ^= th; \ + } while (0) + +#define INPUT_BUF144 do { \ + READ64(a00, 0); \ + READ64(a10, 8); \ + READ64(a20, 16); \ + READ64(a30, 24); \ + READ64(a40, 32); \ + READ64(a01, 40); \ + READ64(a11, 48); \ + READ64(a21, 56); \ + READ64(a31, 64); \ + READ64(a41, 72); \ + READ64(a02, 80); \ + READ64(a12, 88); \ + READ64(a22, 96); \ + READ64(a32, 104); \ + READ64(a42, 112); \ + READ64(a03, 120); \ + READ64(a13, 128); \ + READ64(a23, 136); \ + } while (0) + +#define INPUT_BUF136 do { \ + READ64(a00, 0); \ + READ64(a10, 8); \ + READ64(a20, 16); \ + READ64(a30, 24); \ + READ64(a40, 32); \ + READ64(a01, 40); \ + READ64(a11, 48); \ + READ64(a21, 56); \ + READ64(a31, 64); \ + READ64(a41, 72); \ + READ64(a02, 80); \ + READ64(a12, 88); \ + READ64(a22, 96); \ + READ64(a32, 104); \ + READ64(a42, 112); \ + READ64(a03, 120); \ + READ64(a13, 128); \ + } while (0) + +#define INPUT_BUF104 do { \ + READ64(a00, 0); \ + READ64(a10, 8); \ + READ64(a20, 16); \ + READ64(a30, 24); \ + READ64(a40, 32); \ + READ64(a01, 40); \ + READ64(a11, 48); \ + READ64(a21, 56); \ + READ64(a31, 64); \ + READ64(a41, 72); \ + READ64(a02, 80); \ + READ64(a12, 88); \ + READ64(a22, 96); \ + } while (0) + +#define INPUT_BUF72 do { \ + READ64(a00, 0); \ + READ64(a10, 8); \ + READ64(a20, 16); \ + READ64(a30, 24); \ + READ64(a40, 32); \ + READ64(a01, 40); \ + READ64(a11, 48); \ + READ64(a21, 56); \ + READ64(a31, 64); \ + } while (0) + +#define INPUT_BUF(lim) do { \ + READ64(a00, 0); \ + READ64(a10, 8); \ + READ64(a20, 16); \ + READ64(a30, 24); \ + READ64(a40, 32); \ + READ64(a01, 40); \ + READ64(a11, 48); \ + READ64(a21, 56); \ + READ64(a31, 64); \ + if ((lim) == 72) \ + break; \ + READ64(a41, 72); \ + READ64(a02, 80); \ + READ64(a12, 88); \ + READ64(a22, 96); \ + if ((lim) == 104) \ + break; \ + READ64(a32, 104); \ + READ64(a42, 112); \ + READ64(a03, 120); \ + READ64(a13, 128); \ + if ((lim) == 136) \ + break; \ + READ64(a23, 136); \ + } while (0) + +#endif + +#define DECL64(x) sph_u64 x ## l, x ## h +#define MOV64(d, s) (d ## l = s ## l, d ## h = s ## h) +#define 
XOR64(d, a, b) (d ## l = a ## l ^ b ## l, d ## h = a ## h ^ b ## h) +#define AND64(d, a, b) (d ## l = a ## l & b ## l, d ## h = a ## h & b ## h) +#define OR64(d, a, b) (d ## l = a ## l | b ## l, d ## h = a ## h | b ## h) +#define NOT64(d, s) (d ## l = SPH_T32(~s ## l), d ## h = SPH_T32(~s ## h)) +#define ROL64(d, v, n) ROL64_ ## n(d, v) + +#if SPH_KECCAK_INTERLEAVE + +#define ROL64_odd1(d, v) do { \ + sph_u32 tmp; \ + tmp = v ## l; \ + d ## l = SPH_T32(v ## h << 1) | (v ## h >> 31); \ + d ## h = tmp; \ + } while (0) + +#define ROL64_odd63(d, v) do { \ + sph_u32 tmp; \ + tmp = SPH_T32(v ## l << 31) | (v ## l >> 1); \ + d ## l = v ## h; \ + d ## h = tmp; \ + } while (0) + +#define ROL64_odd(d, v, n) do { \ + sph_u32 tmp; \ + tmp = SPH_T32(v ## l << (n - 1)) | (v ## l >> (33 - n)); \ + d ## l = SPH_T32(v ## h << n) | (v ## h >> (32 - n)); \ + d ## h = tmp; \ + } while (0) + +#define ROL64_even(d, v, n) do { \ + d ## l = SPH_T32(v ## l << n) | (v ## l >> (32 - n)); \ + d ## h = SPH_T32(v ## h << n) | (v ## h >> (32 - n)); \ + } while (0) + +#define ROL64_0(d, v) +#define ROL64_1(d, v) ROL64_odd1(d, v) +#define ROL64_2(d, v) ROL64_even(d, v, 1) +#define ROL64_3(d, v) ROL64_odd( d, v, 2) +#define ROL64_4(d, v) ROL64_even(d, v, 2) +#define ROL64_5(d, v) ROL64_odd( d, v, 3) +#define ROL64_6(d, v) ROL64_even(d, v, 3) +#define ROL64_7(d, v) ROL64_odd( d, v, 4) +#define ROL64_8(d, v) ROL64_even(d, v, 4) +#define ROL64_9(d, v) ROL64_odd( d, v, 5) +#define ROL64_10(d, v) ROL64_even(d, v, 5) +#define ROL64_11(d, v) ROL64_odd( d, v, 6) +#define ROL64_12(d, v) ROL64_even(d, v, 6) +#define ROL64_13(d, v) ROL64_odd( d, v, 7) +#define ROL64_14(d, v) ROL64_even(d, v, 7) +#define ROL64_15(d, v) ROL64_odd( d, v, 8) +#define ROL64_16(d, v) ROL64_even(d, v, 8) +#define ROL64_17(d, v) ROL64_odd( d, v, 9) +#define ROL64_18(d, v) ROL64_even(d, v, 9) +#define ROL64_19(d, v) ROL64_odd( d, v, 10) +#define ROL64_20(d, v) ROL64_even(d, v, 10) +#define ROL64_21(d, v) ROL64_odd( d, v, 11) +#define ROL64_22(d, v) ROL64_even(d, v, 11) +#define ROL64_23(d, v) ROL64_odd( d, v, 12) +#define ROL64_24(d, v) ROL64_even(d, v, 12) +#define ROL64_25(d, v) ROL64_odd( d, v, 13) +#define ROL64_26(d, v) ROL64_even(d, v, 13) +#define ROL64_27(d, v) ROL64_odd( d, v, 14) +#define ROL64_28(d, v) ROL64_even(d, v, 14) +#define ROL64_29(d, v) ROL64_odd( d, v, 15) +#define ROL64_30(d, v) ROL64_even(d, v, 15) +#define ROL64_31(d, v) ROL64_odd( d, v, 16) +#define ROL64_32(d, v) ROL64_even(d, v, 16) +#define ROL64_33(d, v) ROL64_odd( d, v, 17) +#define ROL64_34(d, v) ROL64_even(d, v, 17) +#define ROL64_35(d, v) ROL64_odd( d, v, 18) +#define ROL64_36(d, v) ROL64_even(d, v, 18) +#define ROL64_37(d, v) ROL64_odd( d, v, 19) +#define ROL64_38(d, v) ROL64_even(d, v, 19) +#define ROL64_39(d, v) ROL64_odd( d, v, 20) +#define ROL64_40(d, v) ROL64_even(d, v, 20) +#define ROL64_41(d, v) ROL64_odd( d, v, 21) +#define ROL64_42(d, v) ROL64_even(d, v, 21) +#define ROL64_43(d, v) ROL64_odd( d, v, 22) +#define ROL64_44(d, v) ROL64_even(d, v, 22) +#define ROL64_45(d, v) ROL64_odd( d, v, 23) +#define ROL64_46(d, v) ROL64_even(d, v, 23) +#define ROL64_47(d, v) ROL64_odd( d, v, 24) +#define ROL64_48(d, v) ROL64_even(d, v, 24) +#define ROL64_49(d, v) ROL64_odd( d, v, 25) +#define ROL64_50(d, v) ROL64_even(d, v, 25) +#define ROL64_51(d, v) ROL64_odd( d, v, 26) +#define ROL64_52(d, v) ROL64_even(d, v, 26) +#define ROL64_53(d, v) ROL64_odd( d, v, 27) +#define ROL64_54(d, v) ROL64_even(d, v, 27) +#define ROL64_55(d, v) ROL64_odd( d, v, 28) +#define ROL64_56(d, v) 
ROL64_even(d, v, 28) +#define ROL64_57(d, v) ROL64_odd( d, v, 29) +#define ROL64_58(d, v) ROL64_even(d, v, 29) +#define ROL64_59(d, v) ROL64_odd( d, v, 30) +#define ROL64_60(d, v) ROL64_even(d, v, 30) +#define ROL64_61(d, v) ROL64_odd( d, v, 31) +#define ROL64_62(d, v) ROL64_even(d, v, 31) +#define ROL64_63(d, v) ROL64_odd63(d, v) + +#else + +#define ROL64_small(d, v, n) do { \ + sph_u32 tmp; \ + tmp = SPH_T32(v ## l << n) | (v ## h >> (32 - n)); \ + d ## h = SPH_T32(v ## h << n) | (v ## l >> (32 - n)); \ + d ## l = tmp; \ + } while (0) + +#define ROL64_0(d, v) 0 +#define ROL64_1(d, v) ROL64_small(d, v, 1) +#define ROL64_2(d, v) ROL64_small(d, v, 2) +#define ROL64_3(d, v) ROL64_small(d, v, 3) +#define ROL64_4(d, v) ROL64_small(d, v, 4) +#define ROL64_5(d, v) ROL64_small(d, v, 5) +#define ROL64_6(d, v) ROL64_small(d, v, 6) +#define ROL64_7(d, v) ROL64_small(d, v, 7) +#define ROL64_8(d, v) ROL64_small(d, v, 8) +#define ROL64_9(d, v) ROL64_small(d, v, 9) +#define ROL64_10(d, v) ROL64_small(d, v, 10) +#define ROL64_11(d, v) ROL64_small(d, v, 11) +#define ROL64_12(d, v) ROL64_small(d, v, 12) +#define ROL64_13(d, v) ROL64_small(d, v, 13) +#define ROL64_14(d, v) ROL64_small(d, v, 14) +#define ROL64_15(d, v) ROL64_small(d, v, 15) +#define ROL64_16(d, v) ROL64_small(d, v, 16) +#define ROL64_17(d, v) ROL64_small(d, v, 17) +#define ROL64_18(d, v) ROL64_small(d, v, 18) +#define ROL64_19(d, v) ROL64_small(d, v, 19) +#define ROL64_20(d, v) ROL64_small(d, v, 20) +#define ROL64_21(d, v) ROL64_small(d, v, 21) +#define ROL64_22(d, v) ROL64_small(d, v, 22) +#define ROL64_23(d, v) ROL64_small(d, v, 23) +#define ROL64_24(d, v) ROL64_small(d, v, 24) +#define ROL64_25(d, v) ROL64_small(d, v, 25) +#define ROL64_26(d, v) ROL64_small(d, v, 26) +#define ROL64_27(d, v) ROL64_small(d, v, 27) +#define ROL64_28(d, v) ROL64_small(d, v, 28) +#define ROL64_29(d, v) ROL64_small(d, v, 29) +#define ROL64_30(d, v) ROL64_small(d, v, 30) +#define ROL64_31(d, v) ROL64_small(d, v, 31) + +#define ROL64_32(d, v) do { \ + sph_u32 tmp; \ + tmp = v ## l; \ + d ## l = v ## h; \ + d ## h = tmp; \ + } while (0) + +#define ROL64_big(d, v, n) do { \ + sph_u32 trl, trh; \ + ROL64_small(tr, v, n); \ + d ## h = trl; \ + d ## l = trh; \ + } while (0) + +#define ROL64_33(d, v) ROL64_big(d, v, 1) +#define ROL64_34(d, v) ROL64_big(d, v, 2) +#define ROL64_35(d, v) ROL64_big(d, v, 3) +#define ROL64_36(d, v) ROL64_big(d, v, 4) +#define ROL64_37(d, v) ROL64_big(d, v, 5) +#define ROL64_38(d, v) ROL64_big(d, v, 6) +#define ROL64_39(d, v) ROL64_big(d, v, 7) +#define ROL64_40(d, v) ROL64_big(d, v, 8) +#define ROL64_41(d, v) ROL64_big(d, v, 9) +#define ROL64_42(d, v) ROL64_big(d, v, 10) +#define ROL64_43(d, v) ROL64_big(d, v, 11) +#define ROL64_44(d, v) ROL64_big(d, v, 12) +#define ROL64_45(d, v) ROL64_big(d, v, 13) +#define ROL64_46(d, v) ROL64_big(d, v, 14) +#define ROL64_47(d, v) ROL64_big(d, v, 15) +#define ROL64_48(d, v) ROL64_big(d, v, 16) +#define ROL64_49(d, v) ROL64_big(d, v, 17) +#define ROL64_50(d, v) ROL64_big(d, v, 18) +#define ROL64_51(d, v) ROL64_big(d, v, 19) +#define ROL64_52(d, v) ROL64_big(d, v, 20) +#define ROL64_53(d, v) ROL64_big(d, v, 21) +#define ROL64_54(d, v) ROL64_big(d, v, 22) +#define ROL64_55(d, v) ROL64_big(d, v, 23) +#define ROL64_56(d, v) ROL64_big(d, v, 24) +#define ROL64_57(d, v) ROL64_big(d, v, 25) +#define ROL64_58(d, v) ROL64_big(d, v, 26) +#define ROL64_59(d, v) ROL64_big(d, v, 27) +#define ROL64_60(d, v) ROL64_big(d, v, 28) +#define ROL64_61(d, v) ROL64_big(d, v, 29) +#define ROL64_62(d, v) ROL64_big(d, v, 30) 
+#define ROL64_63(d, v) ROL64_big(d, v, 31) + +#endif + +#define XOR64_IOTA(d, s, k) \ + (d ## l = s ## l ^ k.low, d ## h = s ## h ^ k.high) + +#endif + +#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) do { \ + DECL64(tt0); \ + DECL64(tt1); \ + DECL64(tt2); \ + DECL64(tt3); \ + XOR64(tt0, d0, d1); \ + XOR64(tt1, d2, d3); \ + XOR64(tt0, tt0, d4); \ + XOR64(tt0, tt0, tt1); \ + ROL64(tt0, tt0, 1); \ + XOR64(tt2, c0, c1); \ + XOR64(tt3, c2, c3); \ + XOR64(tt0, tt0, c4); \ + XOR64(tt2, tt2, tt3); \ + XOR64(t, tt0, tt2); \ + } while (0) + +#define THETA(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \ + b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \ + b40, b41, b42, b43, b44) \ + do { \ + DECL64(t0); \ + DECL64(t1); \ + DECL64(t2); \ + DECL64(t3); \ + DECL64(t4); \ + TH_ELT(t0, b40, b41, b42, b43, b44, b10, b11, b12, b13, b14); \ + TH_ELT(t1, b00, b01, b02, b03, b04, b20, b21, b22, b23, b24); \ + TH_ELT(t2, b10, b11, b12, b13, b14, b30, b31, b32, b33, b34); \ + TH_ELT(t3, b20, b21, b22, b23, b24, b40, b41, b42, b43, b44); \ + TH_ELT(t4, b30, b31, b32, b33, b34, b00, b01, b02, b03, b04); \ + XOR64(b00, b00, t0); \ + XOR64(b01, b01, t0); \ + XOR64(b02, b02, t0); \ + XOR64(b03, b03, t0); \ + XOR64(b04, b04, t0); \ + XOR64(b10, b10, t1); \ + XOR64(b11, b11, t1); \ + XOR64(b12, b12, t1); \ + XOR64(b13, b13, t1); \ + XOR64(b14, b14, t1); \ + XOR64(b20, b20, t2); \ + XOR64(b21, b21, t2); \ + XOR64(b22, b22, t2); \ + XOR64(b23, b23, t2); \ + XOR64(b24, b24, t2); \ + XOR64(b30, b30, t3); \ + XOR64(b31, b31, t3); \ + XOR64(b32, b32, t3); \ + XOR64(b33, b33, t3); \ + XOR64(b34, b34, t3); \ + XOR64(b40, b40, t4); \ + XOR64(b41, b41, t4); \ + XOR64(b42, b42, t4); \ + XOR64(b43, b43, t4); \ + XOR64(b44, b44, t4); \ + } while (0) + +#define RHO(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \ + b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \ + b40, b41, b42, b43, b44) \ + do { \ + /* ROL64(b00, b00, 0); */ \ + ROL64(b01, b01, 36); \ + ROL64(b02, b02, 3); \ + ROL64(b03, b03, 41); \ + ROL64(b04, b04, 18); \ + ROL64(b10, b10, 1); \ + ROL64(b11, b11, 44); \ + ROL64(b12, b12, 10); \ + ROL64(b13, b13, 45); \ + ROL64(b14, b14, 2); \ + ROL64(b20, b20, 62); \ + ROL64(b21, b21, 6); \ + ROL64(b22, b22, 43); \ + ROL64(b23, b23, 15); \ + ROL64(b24, b24, 61); \ + ROL64(b30, b30, 28); \ + ROL64(b31, b31, 55); \ + ROL64(b32, b32, 25); \ + ROL64(b33, b33, 21); \ + ROL64(b34, b34, 56); \ + ROL64(b40, b40, 27); \ + ROL64(b41, b41, 20); \ + ROL64(b42, b42, 39); \ + ROL64(b43, b43, 8); \ + ROL64(b44, b44, 14); \ + } while (0) + +/* + * The KHI macro integrates the "lane complement" optimization. On input, + * some words are complemented: + * a00 a01 a02 a04 a13 a20 a21 a22 a30 a33 a34 a43 + * On output, the following words are complemented: + * a04 a10 a20 a22 a23 a31 + * + * The (implicit) permutation and the theta expansion will bring back + * the input mask for the next round. 
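+ *
+ * Clarifying note (added, not from the original sphlib comment): the
+ * trick rests on De Morgan's law, ~x & ~y == ~(x | y), together with
+ * ~(x ^ y) == ~x ^ y. The chi step computes a ^ (~b & c); when the
+ * complemented copy of b is the one held in a lane, this is a plain
+ * AND (KHI_XA below), and when the complement sits on c, the result is
+ * obtained, itself complemented, from an OR (KHI_XO below). Only one
+ * explicit NOT64 per row of five lanes remains in the KHI macro.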
+ */ + +#define KHI_XO(d, a, b, c) do { \ + DECL64(kt); \ + OR64(kt, b, c); \ + XOR64(d, a, kt); \ + } while (0) + +#define KHI_XA(d, a, b, c) do { \ + DECL64(kt); \ + AND64(kt, b, c); \ + XOR64(d, a, kt); \ + } while (0) + +#define KHI(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \ + b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \ + b40, b41, b42, b43, b44) \ + do { \ + DECL64(c0); \ + DECL64(c1); \ + DECL64(c2); \ + DECL64(c3); \ + DECL64(c4); \ + DECL64(bnn); \ + NOT64(bnn, b20); \ + KHI_XO(c0, b00, b10, b20); \ + KHI_XO(c1, b10, bnn, b30); \ + KHI_XA(c2, b20, b30, b40); \ + KHI_XO(c3, b30, b40, b00); \ + KHI_XA(c4, b40, b00, b10); \ + MOV64(b00, c0); \ + MOV64(b10, c1); \ + MOV64(b20, c2); \ + MOV64(b30, c3); \ + MOV64(b40, c4); \ + NOT64(bnn, b41); \ + KHI_XO(c0, b01, b11, b21); \ + KHI_XA(c1, b11, b21, b31); \ + KHI_XO(c2, b21, b31, bnn); \ + KHI_XO(c3, b31, b41, b01); \ + KHI_XA(c4, b41, b01, b11); \ + MOV64(b01, c0); \ + MOV64(b11, c1); \ + MOV64(b21, c2); \ + MOV64(b31, c3); \ + MOV64(b41, c4); \ + NOT64(bnn, b32); \ + KHI_XO(c0, b02, b12, b22); \ + KHI_XA(c1, b12, b22, b32); \ + KHI_XA(c2, b22, bnn, b42); \ + KHI_XO(c3, bnn, b42, b02); \ + KHI_XA(c4, b42, b02, b12); \ + MOV64(b02, c0); \ + MOV64(b12, c1); \ + MOV64(b22, c2); \ + MOV64(b32, c3); \ + MOV64(b42, c4); \ + NOT64(bnn, b33); \ + KHI_XA(c0, b03, b13, b23); \ + KHI_XO(c1, b13, b23, b33); \ + KHI_XO(c2, b23, bnn, b43); \ + KHI_XA(c3, bnn, b43, b03); \ + KHI_XO(c4, b43, b03, b13); \ + MOV64(b03, c0); \ + MOV64(b13, c1); \ + MOV64(b23, c2); \ + MOV64(b33, c3); \ + MOV64(b43, c4); \ + NOT64(bnn, b14); \ + KHI_XA(c0, b04, bnn, b24); \ + KHI_XO(c1, bnn, b24, b34); \ + KHI_XA(c2, b24, b34, b44); \ + KHI_XO(c3, b34, b44, b04); \ + KHI_XA(c4, b44, b04, b14); \ + MOV64(b04, c0); \ + MOV64(b14, c1); \ + MOV64(b24, c2); \ + MOV64(b34, c3); \ + MOV64(b44, c4); \ + } while (0) + +#define IOTA(r) XOR64_IOTA(a00, a00, r) + +#define P0 a00, a01, a02, a03, a04, a10, a11, a12, a13, a14, a20, a21, \ + a22, a23, a24, a30, a31, a32, a33, a34, a40, a41, a42, a43, a44 +#define P1 a00, a30, a10, a40, a20, a11, a41, a21, a01, a31, a22, a02, \ + a32, a12, a42, a33, a13, a43, a23, a03, a44, a24, a04, a34, a14 +#define P2 a00, a33, a11, a44, a22, a41, a24, a02, a30, a13, a32, a10, \ + a43, a21, a04, a23, a01, a34, a12, a40, a14, a42, a20, a03, a31 +#define P3 a00, a23, a41, a14, a32, a24, a42, a10, a33, a01, a43, a11, \ + a34, a02, a20, a12, a30, a03, a21, a44, a31, a04, a22, a40, a13 +#define P4 a00, a12, a24, a31, a43, a42, a04, a11, a23, a30, a34, a41, \ + a03, a10, a22, a21, a33, a40, a02, a14, a13, a20, a32, a44, a01 +#define P5 a00, a21, a42, a13, a34, a04, a20, a41, a12, a33, a03, a24, \ + a40, a11, a32, a02, a23, a44, a10, a31, a01, a22, a43, a14, a30 +#define P6 a00, a02, a04, a01, a03, a20, a22, a24, a21, a23, a40, a42, \ + a44, a41, a43, a10, a12, a14, a11, a13, a30, a32, a34, a31, a33 +#define P7 a00, a10, a20, a30, a40, a22, a32, a42, a02, a12, a44, a04, \ + a14, a24, a34, a11, a21, a31, a41, a01, a33, a43, a03, a13, a23 +#define P8 a00, a11, a22, a33, a44, a32, a43, a04, a10, a21, a14, a20, \ + a31, a42, a03, a41, a02, a13, a24, a30, a23, a34, a40, a01, a12 +#define P9 a00, a41, a32, a23, a14, a43, a34, a20, a11, a02, a31, a22, \ + a13, a04, a40, a24, a10, a01, a42, a33, a12, a03, a44, a30, a21 +#define P10 a00, a24, a43, a12, a31, a34, a03, a22, a41, a10, a13, a32, \ + a01, a20, a44, a42, a11, a30, a04, a23, a21, a40, a14, a33, a02 +#define P11 a00, a42, a34, a21, a13, a03, a40, a32, a24, a11, a01, a43, \ + a30, a22, a14, a04, 
a41, a33, a20, a12, a02, a44, a31, a23, a10 +#define P12 a00, a04, a03, a02, a01, a40, a44, a43, a42, a41, a30, a34, \ + a33, a32, a31, a20, a24, a23, a22, a21, a10, a14, a13, a12, a11 +#define P13 a00, a20, a40, a10, a30, a44, a14, a34, a04, a24, a33, a03, \ + a23, a43, a13, a22, a42, a12, a32, a02, a11, a31, a01, a21, a41 +#define P14 a00, a22, a44, a11, a33, a14, a31, a03, a20, a42, a23, a40, \ + a12, a34, a01, a32, a04, a21, a43, a10, a41, a13, a30, a02, a24 +#define P15 a00, a32, a14, a41, a23, a31, a13, a40, a22, a04, a12, a44, \ + a21, a03, a30, a43, a20, a02, a34, a11, a24, a01, a33, a10, a42 +#define P16 a00, a43, a31, a24, a12, a13, a01, a44, a32, a20, a21, a14, \ + a02, a40, a33, a34, a22, a10, a03, a41, a42, a30, a23, a11, a04 +#define P17 a00, a34, a13, a42, a21, a01, a30, a14, a43, a22, a02, a31, \ + a10, a44, a23, a03, a32, a11, a40, a24, a04, a33, a12, a41, a20 +#define P18 a00, a03, a01, a04, a02, a30, a33, a31, a34, a32, a10, a13, \ + a11, a14, a12, a40, a43, a41, a44, a42, a20, a23, a21, a24, a22 +#define P19 a00, a40, a30, a20, a10, a33, a23, a13, a03, a43, a11, a01, \ + a41, a31, a21, a44, a34, a24, a14, a04, a22, a12, a02, a42, a32 +#define P20 a00, a44, a33, a22, a11, a23, a12, a01, a40, a34, a41, a30, \ + a24, a13, a02, a14, a03, a42, a31, a20, a32, a21, a10, a04, a43 +#define P21 a00, a14, a23, a32, a41, a12, a21, a30, a44, a03, a24, a33, \ + a42, a01, a10, a31, a40, a04, a13, a22, a43, a02, a11, a20, a34 +#define P22 a00, a31, a12, a43, a24, a21, a02, a33, a14, a40, a42, a23, \ + a04, a30, a11, a13, a44, a20, a01, a32, a34, a10, a41, a22, a03 +#define P23 a00, a13, a21, a34, a42, a02, a10, a23, a31, a44, a04, a12, \ + a20, a33, a41, a01, a14, a22, a30, a43, a03, a11, a24, a32, a40 + +#define P1_TO_P0 do { \ + DECL64(t); \ + MOV64(t, a01); \ + MOV64(a01, a30); \ + MOV64(a30, a33); \ + MOV64(a33, a23); \ + MOV64(a23, a12); \ + MOV64(a12, a21); \ + MOV64(a21, a02); \ + MOV64(a02, a10); \ + MOV64(a10, a11); \ + MOV64(a11, a41); \ + MOV64(a41, a24); \ + MOV64(a24, a42); \ + MOV64(a42, a04); \ + MOV64(a04, a20); \ + MOV64(a20, a22); \ + MOV64(a22, a32); \ + MOV64(a32, a43); \ + MOV64(a43, a34); \ + MOV64(a34, a03); \ + MOV64(a03, a40); \ + MOV64(a40, a44); \ + MOV64(a44, a14); \ + MOV64(a14, a31); \ + MOV64(a31, a13); \ + MOV64(a13, t); \ + } while (0) + +#define P2_TO_P0 do { \ + DECL64(t); \ + MOV64(t, a01); \ + MOV64(a01, a33); \ + MOV64(a33, a12); \ + MOV64(a12, a02); \ + MOV64(a02, a11); \ + MOV64(a11, a24); \ + MOV64(a24, a04); \ + MOV64(a04, a22); \ + MOV64(a22, a43); \ + MOV64(a43, a03); \ + MOV64(a03, a44); \ + MOV64(a44, a31); \ + MOV64(a31, t); \ + MOV64(t, a10); \ + MOV64(a10, a41); \ + MOV64(a41, a42); \ + MOV64(a42, a20); \ + MOV64(a20, a32); \ + MOV64(a32, a34); \ + MOV64(a34, a40); \ + MOV64(a40, a14); \ + MOV64(a14, a13); \ + MOV64(a13, a30); \ + MOV64(a30, a23); \ + MOV64(a23, a21); \ + MOV64(a21, t); \ + } while (0) + +#define P4_TO_P0 do { \ + DECL64(t); \ + MOV64(t, a01); \ + MOV64(a01, a12); \ + MOV64(a12, a11); \ + MOV64(a11, a04); \ + MOV64(a04, a43); \ + MOV64(a43, a44); \ + MOV64(a44, t); \ + MOV64(t, a02); \ + MOV64(a02, a24); \ + MOV64(a24, a22); \ + MOV64(a22, a03); \ + MOV64(a03, a31); \ + MOV64(a31, a33); \ + MOV64(a33, t); \ + MOV64(t, a10); \ + MOV64(a10, a42); \ + MOV64(a42, a32); \ + MOV64(a32, a40); \ + MOV64(a40, a13); \ + MOV64(a13, a23); \ + MOV64(a23, t); \ + MOV64(t, a14); \ + MOV64(a14, a30); \ + MOV64(a30, a21); \ + MOV64(a21, a41); \ + MOV64(a41, a20); \ + MOV64(a20, a34); \ + MOV64(a34, t); \ + } while (0) + +#define P6_TO_P0 
do { \ + DECL64(t); \ + MOV64(t, a01); \ + MOV64(a01, a02); \ + MOV64(a02, a04); \ + MOV64(a04, a03); \ + MOV64(a03, t); \ + MOV64(t, a10); \ + MOV64(a10, a20); \ + MOV64(a20, a40); \ + MOV64(a40, a30); \ + MOV64(a30, t); \ + MOV64(t, a11); \ + MOV64(a11, a22); \ + MOV64(a22, a44); \ + MOV64(a44, a33); \ + MOV64(a33, t); \ + MOV64(t, a12); \ + MOV64(a12, a24); \ + MOV64(a24, a43); \ + MOV64(a43, a31); \ + MOV64(a31, t); \ + MOV64(t, a13); \ + MOV64(a13, a21); \ + MOV64(a21, a42); \ + MOV64(a42, a34); \ + MOV64(a34, t); \ + MOV64(t, a14); \ + MOV64(a14, a23); \ + MOV64(a23, a41); \ + MOV64(a41, a32); \ + MOV64(a32, t); \ + } while (0) + +#define P8_TO_P0 do { \ + DECL64(t); \ + MOV64(t, a01); \ + MOV64(a01, a11); \ + MOV64(a11, a43); \ + MOV64(a43, t); \ + MOV64(t, a02); \ + MOV64(a02, a22); \ + MOV64(a22, a31); \ + MOV64(a31, t); \ + MOV64(t, a03); \ + MOV64(a03, a33); \ + MOV64(a33, a24); \ + MOV64(a24, t); \ + MOV64(t, a04); \ + MOV64(a04, a44); \ + MOV64(a44, a12); \ + MOV64(a12, t); \ + MOV64(t, a10); \ + MOV64(a10, a32); \ + MOV64(a32, a13); \ + MOV64(a13, t); \ + MOV64(t, a14); \ + MOV64(a14, a21); \ + MOV64(a21, a20); \ + MOV64(a20, t); \ + MOV64(t, a23); \ + MOV64(a23, a42); \ + MOV64(a42, a40); \ + MOV64(a40, t); \ + MOV64(t, a30); \ + MOV64(a30, a41); \ + MOV64(a41, a34); \ + MOV64(a34, t); \ + } while (0) + +#define P12_TO_P0 do { \ + DECL64(t); \ + MOV64(t, a01); \ + MOV64(a01, a04); \ + MOV64(a04, t); \ + MOV64(t, a02); \ + MOV64(a02, a03); \ + MOV64(a03, t); \ + MOV64(t, a10); \ + MOV64(a10, a40); \ + MOV64(a40, t); \ + MOV64(t, a11); \ + MOV64(a11, a44); \ + MOV64(a44, t); \ + MOV64(t, a12); \ + MOV64(a12, a43); \ + MOV64(a43, t); \ + MOV64(t, a13); \ + MOV64(a13, a42); \ + MOV64(a42, t); \ + MOV64(t, a14); \ + MOV64(a14, a41); \ + MOV64(a41, t); \ + MOV64(t, a20); \ + MOV64(a20, a30); \ + MOV64(a30, t); \ + MOV64(t, a21); \ + MOV64(a21, a34); \ + MOV64(a34, t); \ + MOV64(t, a22); \ + MOV64(a22, a33); \ + MOV64(a33, t); \ + MOV64(t, a23); \ + MOV64(a23, a32); \ + MOV64(a32, t); \ + MOV64(t, a24); \ + MOV64(a24, a31); \ + MOV64(a31, t); \ + } while (0) + +#define LPAR ( +#define RPAR ) + +#define KF_ELT(r, s, k) do { \ + THETA LPAR P ## r RPAR; \ + RHO LPAR P ## r RPAR; \ + KHI LPAR P ## s RPAR; \ + IOTA(k); \ + } while (0) + +#define DO(x) x + +#define KECCAK_F_1600 DO(KECCAK_F_1600_) + +#if SPH_KECCAK_UNROLL == 1 + +#define KECCAK_F_1600_ do { \ + int j; \ + for (j = 0; j < 24; j ++) { \ + KF_ELT( 0, 1, RC[j + 0]); \ + P1_TO_P0; \ + } \ + } while (0) + +#elif SPH_KECCAK_UNROLL == 2 + +#define KECCAK_F_1600_ do { \ + int j; \ + for (j = 0; j < 24; j += 2) { \ + KF_ELT( 0, 1, RC[j + 0]); \ + KF_ELT( 1, 2, RC[j + 1]); \ + P2_TO_P0; \ + } \ + } while (0) + +#elif SPH_KECCAK_UNROLL == 4 + +#define KECCAK_F_1600_ do { \ + int j; \ + for (j = 0; j < 24; j += 4) { \ + KF_ELT( 0, 1, RC[j + 0]); \ + KF_ELT( 1, 2, RC[j + 1]); \ + KF_ELT( 2, 3, RC[j + 2]); \ + KF_ELT( 3, 4, RC[j + 3]); \ + P4_TO_P0; \ + } \ + } while (0) + +#elif SPH_KECCAK_UNROLL == 6 + +#define KECCAK_F_1600_ do { \ + int j; \ + for (j = 0; j < 24; j += 6) { \ + KF_ELT( 0, 1, RC[j + 0]); \ + KF_ELT( 1, 2, RC[j + 1]); \ + KF_ELT( 2, 3, RC[j + 2]); \ + KF_ELT( 3, 4, RC[j + 3]); \ + KF_ELT( 4, 5, RC[j + 4]); \ + KF_ELT( 5, 6, RC[j + 5]); \ + P6_TO_P0; \ + } \ + } while (0) + +#elif SPH_KECCAK_UNROLL == 8 + +#define KECCAK_F_1600_ do { \ + int j; \ + for (j = 0; j < 24; j += 8) { \ + KF_ELT( 0, 1, RC[j + 0]); \ + KF_ELT( 1, 2, RC[j + 1]); \ + KF_ELT( 2, 3, RC[j + 2]); \ + KF_ELT( 3, 4, RC[j + 3]); \ + KF_ELT( 4, 5, RC[j 
+ 4]); \ + KF_ELT( 5, 6, RC[j + 5]); \ + KF_ELT( 6, 7, RC[j + 6]); \ + KF_ELT( 7, 8, RC[j + 7]); \ + P8_TO_P0; \ + } \ + } while (0) + +#elif SPH_KECCAK_UNROLL == 12 + +#define KECCAK_F_1600_ do { \ + int j; \ + for (j = 0; j < 24; j += 12) { \ + KF_ELT( 0, 1, RC[j + 0]); \ + KF_ELT( 1, 2, RC[j + 1]); \ + KF_ELT( 2, 3, RC[j + 2]); \ + KF_ELT( 3, 4, RC[j + 3]); \ + KF_ELT( 4, 5, RC[j + 4]); \ + KF_ELT( 5, 6, RC[j + 5]); \ + KF_ELT( 6, 7, RC[j + 6]); \ + KF_ELT( 7, 8, RC[j + 7]); \ + KF_ELT( 8, 9, RC[j + 8]); \ + KF_ELT( 9, 10, RC[j + 9]); \ + KF_ELT(10, 11, RC[j + 10]); \ + KF_ELT(11, 12, RC[j + 11]); \ + P12_TO_P0; \ + } \ + } while (0) + +#elif SPH_KECCAK_UNROLL == 0 + +#define KECCAK_F_1600_ do { \ + KF_ELT( 0, 1, RC[ 0]); \ + KF_ELT( 1, 2, RC[ 1]); \ + KF_ELT( 2, 3, RC[ 2]); \ + KF_ELT( 3, 4, RC[ 3]); \ + KF_ELT( 4, 5, RC[ 4]); \ + KF_ELT( 5, 6, RC[ 5]); \ + KF_ELT( 6, 7, RC[ 6]); \ + KF_ELT( 7, 8, RC[ 7]); \ + KF_ELT( 8, 9, RC[ 8]); \ + KF_ELT( 9, 10, RC[ 9]); \ + KF_ELT(10, 11, RC[10]); \ + KF_ELT(11, 12, RC[11]); \ + KF_ELT(12, 13, RC[12]); \ + KF_ELT(13, 14, RC[13]); \ + KF_ELT(14, 15, RC[14]); \ + KF_ELT(15, 16, RC[15]); \ + KF_ELT(16, 17, RC[16]); \ + KF_ELT(17, 18, RC[17]); \ + KF_ELT(18, 19, RC[18]); \ + KF_ELT(19, 20, RC[19]); \ + KF_ELT(20, 21, RC[20]); \ + KF_ELT(21, 22, RC[21]); \ + KF_ELT(22, 23, RC[22]); \ + KF_ELT(23, 0, RC[23]); \ + } while (0) + +#else + +#error Unimplemented unroll count for Keccak. + +#endif + +static void +keccak_init(sph_keccak_context *kc, unsigned out_size) +{ + int i; + +#if SPH_KECCAK_64 + for (i = 0; i < 25; i ++) + kc->u.wide[i] = 0; + /* + * Initialization for the "lane complement". + */ + kc->u.wide[ 1] = SPH_C64(0xFFFFFFFFFFFFFFFF); + kc->u.wide[ 2] = SPH_C64(0xFFFFFFFFFFFFFFFF); + kc->u.wide[ 8] = SPH_C64(0xFFFFFFFFFFFFFFFF); + kc->u.wide[12] = SPH_C64(0xFFFFFFFFFFFFFFFF); + kc->u.wide[17] = SPH_C64(0xFFFFFFFFFFFFFFFF); + kc->u.wide[20] = SPH_C64(0xFFFFFFFFFFFFFFFF); +#else + + for (i = 0; i < 50; i ++) + kc->u.narrow[i] = 0; + /* + * Initialization for the "lane complement". + * Note: since we set to all-one full 64-bit words, + * interleaving (if applicable) is a no-op. 
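+ * The narrow[] indices below are the low/high 32-bit halves of the
+ * same lanes set in the 64-bit branch above: narrow[2*i] and
+ * narrow[2*i+1] hold lane i, so the pairs (2,3), (4,5), (16,17),
+ * (24,25), (34,35) and (40,41) correspond to wide[1], wide[2],
+ * wide[8], wide[12], wide[17] and wide[20].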
+ */ + kc->u.narrow[ 2] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[ 3] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[ 4] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[ 5] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[16] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[17] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[24] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[25] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[34] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[35] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[40] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[41] = SPH_C32(0xFFFFFFFF); +#endif + kc->ptr = 0; + kc->lim = 200 - (out_size >> 2); +} + +static void +keccak_core(sph_keccak_context *kc, const void *data, size_t len, size_t lim) +{ + unsigned char *buf; + size_t ptr; + DECL_STATE + + buf = kc->buf; + ptr = kc->ptr; + + if (len < (lim - ptr)) { + memcpy(buf + ptr, data, len); + kc->ptr = ptr + len; + return; + } + + READ_STATE(kc); + while (len > 0) { + size_t clen; + + clen = (lim - ptr); + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data = (const unsigned char *)data + clen; + len -= clen; + if (ptr == lim) { + INPUT_BUF(lim); + KECCAK_F_1600; + ptr = 0; + } + } + WRITE_STATE(kc); + kc->ptr = ptr; +} + +#if SPH_KECCAK_64 + +#define DEFCLOSE(d, lim) \ + static void keccak_close ## d( \ + sph_keccak_context *kc, unsigned ub, unsigned n, void *dst) \ + { \ + unsigned eb; \ + union { \ + unsigned char tmp[lim + 1]; \ + sph_u64 dummy; /* for alignment */ \ + } u; \ + size_t j; \ + \ + eb = (0x100 | (ub & 0xFF)) >> (8 - n); \ + if (kc->ptr == (lim - 1)) { \ + if (n == 7) { \ + u.tmp[0] = eb; \ + memset(u.tmp + 1, 0, lim - 1); \ + u.tmp[lim] = 0x80; \ + j = 1 + lim; \ + } else { \ + u.tmp[0] = eb | 0x80; \ + j = 1; \ + } \ + } else { \ + j = lim - kc->ptr; \ + u.tmp[0] = eb; \ + memset(u.tmp + 1, 0, j - 2); \ + u.tmp[j - 1] = 0x80; \ + } \ + keccak_core(kc, u.tmp, j, lim); \ + /* Finalize the "lane complement" */ \ + kc->u.wide[ 1] = ~kc->u.wide[ 1]; \ + kc->u.wide[ 2] = ~kc->u.wide[ 2]; \ + kc->u.wide[ 8] = ~kc->u.wide[ 8]; \ + kc->u.wide[12] = ~kc->u.wide[12]; \ + kc->u.wide[17] = ~kc->u.wide[17]; \ + kc->u.wide[20] = ~kc->u.wide[20]; \ + for (j = 0; j < d; j += 8) \ + sph_enc64le_aligned(u.tmp + j, kc->u.wide[j >> 3]); \ + memcpy(dst, u.tmp, d); \ + keccak_init(kc, (unsigned)d << 3); \ + } \ + +#else + +#define DEFCLOSE(d, lim) \ + static void keccak_close ## d( \ + sph_keccak_context *kc, unsigned ub, unsigned n, void *dst) \ + { \ + unsigned eb; \ + union { \ + unsigned char tmp[lim + 1]; \ + sph_u64 dummy; /* for alignment */ \ + } u; \ + size_t j; \ + \ + eb = (0x100 | (ub & 0xFF)) >> (8 - n); \ + if (kc->ptr == (lim - 1)) { \ + if (n == 7) { \ + u.tmp[0] = eb; \ + memset(u.tmp + 1, 0, lim - 1); \ + u.tmp[lim] = 0x80; \ + j = 1 + lim; \ + } else { \ + u.tmp[0] = eb | 0x80; \ + j = 1; \ + } \ + } else { \ + j = lim - kc->ptr; \ + u.tmp[0] = eb; \ + memset(u.tmp + 1, 0, j - 2); \ + u.tmp[j - 1] = 0x80; \ + } \ + keccak_core(kc, u.tmp, j, lim); \ + /* Finalize the "lane complement" */ \ + kc->u.narrow[ 2] = ~kc->u.narrow[ 2]; \ + kc->u.narrow[ 3] = ~kc->u.narrow[ 3]; \ + kc->u.narrow[ 4] = ~kc->u.narrow[ 4]; \ + kc->u.narrow[ 5] = ~kc->u.narrow[ 5]; \ + kc->u.narrow[16] = ~kc->u.narrow[16]; \ + kc->u.narrow[17] = ~kc->u.narrow[17]; \ + kc->u.narrow[24] = ~kc->u.narrow[24]; \ + kc->u.narrow[25] = ~kc->u.narrow[25]; \ + kc->u.narrow[34] = ~kc->u.narrow[34]; \ + kc->u.narrow[35] = ~kc->u.narrow[35]; \ + kc->u.narrow[40] = ~kc->u.narrow[40]; \ + kc->u.narrow[41] = ~kc->u.narrow[41]; \ + /* un-interleave */ \ + for (j = 0; j < 50; j 
+= 2) \
+ UNINTERLEAVE(kc->u.narrow[j], kc->u.narrow[j + 1]); \
+ for (j = 0; j < d; j += 4) \
+ sph_enc32le_aligned(u.tmp + j, kc->u.narrow[j >> 2]); \
+ memcpy(dst, u.tmp, d); \
+ keccak_init(kc, (unsigned)d << 3); \
+ } \
+
+#endif
+
+DEFCLOSE(28, 144)
+DEFCLOSE(32, 136)
+DEFCLOSE(48, 104)
+DEFCLOSE(64, 72)
+
+/* see sph_keccak.h */
+void
+sph_keccak224_init(void *cc)
+{
+ keccak_init(cc, 224);
+}
+
+/* see sph_keccak.h */
+void
+sph_keccak224(void *cc, const void *data, size_t len)
+{
+ keccak_core(cc, data, len, 144);
+}
+
+/* see sph_keccak.h */
+void
+sph_keccak224_close(void *cc, void *dst)
+{
+ sph_keccak224_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* see sph_keccak.h */
+void
+sph_keccak224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+ keccak_close28(cc, ub, n, dst);
+}
+
+/* see sph_keccak.h */
+void
+sph_keccak256_init(void *cc)
+{
+ keccak_init(cc, 256);
+}
+
+/* see sph_keccak.h */
+void
+sph_keccak256(void *cc, const void *data, size_t len)
+{
+ keccak_core(cc, data, len, 136);
+}
+
+/* see sph_keccak.h */
+void
+sph_keccak256_close(void *cc, void *dst)
+{
+ sph_keccak256_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* see sph_keccak.h */
+void
+sph_keccak256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+ keccak_close32(cc, ub, n, dst);
+}
+
+/* see sph_keccak.h */
+void
+sph_keccak384_init(void *cc)
+{
+ keccak_init(cc, 384);
+}
+
+/* see sph_keccak.h */
+void
+sph_keccak384(void *cc, const void *data, size_t len)
+{
+ keccak_core(cc, data, len, 104);
+}
+
+/* see sph_keccak.h */
+void
+sph_keccak384_close(void *cc, void *dst)
+{
+ sph_keccak384_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* see sph_keccak.h */
+void
+sph_keccak384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+ keccak_close48(cc, ub, n, dst);
+}
+
+/* see sph_keccak.h */
+void
+sph_keccak512_init(void *cc)
+{
+ keccak_init(cc, 512);
+}
+
+/* see sph_keccak.h */
+void
+sph_keccak512(void *cc, const void *data, size_t len)
+{
+ keccak_core(cc, data, len, 72);
+}
+
+/* see sph_keccak.h */
+void
+sph_keccak512_close(void *cc, void *dst)
+{
+ sph_keccak512_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* see sph_keccak.h */
+void
+sph_keccak512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+ keccak_close64(cc, ub, n, dst);
+}
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/miner.h b/miner.h
new file mode 100644
index 0000000..0a743e7
--- /dev/null
+++ b/miner.h
@@ -0,0 +1,315 @@
+#ifndef __MINER_H__
+#define __MINER_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "cpuminer-config.h"
+
+#include <stdbool.h>
+#include <inttypes.h>
+#include <sys/time.h>
+#include <pthread.h>
+#include <jansson.h>
+#include <curl/curl.h>
+
+#ifdef WIN32
+#define snprintf(...)
_snprintf(__VA_ARGS__)
+#define strdup(x) _strdup(x)
+#define strncasecmp(x,y,z) _strnicmp(x,y,z)
+#define strcasecmp(x,y) _stricmp(x,y)
+typedef int ssize_t;
+#endif
+
+#ifdef STDC_HEADERS
+# include <stdlib.h>
+# include <stddef.h>
+#else
+# ifdef HAVE_STDLIB_H
+# include <stdlib.h>
+# endif
+#endif
+#ifdef HAVE_ALLOCA_H
+# include <alloca.h>
+#elif !defined alloca
+# ifdef __GNUC__
+# define alloca __builtin_alloca
+# elif defined _AIX
+# define alloca __alloca
+# elif defined _MSC_VER
+# include <malloc.h>
+# define alloca _alloca
+# elif !defined HAVE_ALLOCA
+# ifdef __cplusplus
+extern "C"
+# endif
+void *alloca (size_t);
+# endif
+#endif
+
+#ifdef HAVE_SYSLOG_H
+#include <syslog.h>
+#else
+enum {
+ LOG_ERR,
+ LOG_WARNING,
+ LOG_NOTICE,
+ LOG_INFO,
+ LOG_DEBUG,
+};
+#endif
+
+#undef unlikely
+#undef likely
+#if defined(__GNUC__) && (__GNUC__ > 2) && defined(__OPTIMIZE__)
+#define unlikely(expr) (__builtin_expect(!!(expr), 0))
+#define likely(expr) (__builtin_expect(!!(expr), 1))
+#else
+#define unlikely(expr) (expr)
+#define likely(expr) (expr)
+#endif
+
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+#endif
+
+#if ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
+#define WANT_BUILTIN_BSWAP
+#else
+#define bswap_32(x) ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) \
+ | (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
+#endif
+
+static inline uint32_t swab32(uint32_t v)
+{
+#ifdef WANT_BUILTIN_BSWAP
+ return __builtin_bswap32(v);
+#else
+ return bswap_32(v);
+#endif
+}
+
+#ifdef HAVE_SYS_ENDIAN_H
+#include <sys/endian.h>
+#endif
+
+#if !HAVE_DECL_BE32DEC
+static inline uint32_t be32dec(const void *pp)
+{
+ const uint8_t *p = (uint8_t const *)pp;
+ return ((uint32_t)(p[3]) + ((uint32_t)(p[2]) << 8) +
+ ((uint32_t)(p[1]) << 16) + ((uint32_t)(p[0]) << 24));
+}
+#endif
+
+#if !HAVE_DECL_LE32DEC
+static inline uint32_t le32dec(const void *pp)
+{
+ const uint8_t *p = (uint8_t const *)pp;
+ return ((uint32_t)(p[0]) + ((uint32_t)(p[1]) << 8) +
+ ((uint32_t)(p[2]) << 16) + ((uint32_t)(p[3]) << 24));
+}
+#endif
+
+#if !HAVE_DECL_BE32ENC
+static inline void be32enc(void *pp, uint32_t x)
+{
+ uint8_t *p = (uint8_t *)pp;
+ p[3] = x & 0xff;
+ p[2] = (x >> 8) & 0xff;
+ p[1] = (x >> 16) & 0xff;
+ p[0] = (x >> 24) & 0xff;
+}
+#endif
+
+#if !HAVE_DECL_LE32ENC
+static inline void le32enc(void *pp, uint32_t x)
+{
+ uint8_t *p = (uint8_t *)pp;
+ p[0] = x & 0xff;
+ p[1] = (x >> 8) & 0xff;
+ p[2] = (x >> 16) & 0xff;
+ p[3] = (x >> 24) & 0xff;
+}
+#endif
+
+#if !HAVE_DECL_BE16DEC
+static inline uint16_t be16dec(const void *pp)
+{
+ const uint8_t *p = (uint8_t const *)pp;
+ return ((uint16_t)(p[1]) + ((uint16_t)(p[0]) << 8));
+}
+#endif
+
+#if !HAVE_DECL_BE16ENC
+static inline void be16enc(void *pp, uint16_t x)
+{
+ uint8_t *p = (uint8_t *)pp;
+ p[1] = x & 0xff;
+ p[0] = (x >> 8) & 0xff;
+}
+#endif
+
+#if !HAVE_DECL_LE16DEC
+static inline uint16_t le16dec(const void *pp)
+{
+ const uint8_t *p = (uint8_t const *)pp;
+ return ((uint16_t)(p[0]) + ((uint16_t)(p[1]) << 8));
+}
+#endif
+
+#if !HAVE_DECL_LE16ENC
+static inline void le16enc(void *pp, uint16_t x)
+{
+ uint8_t *p = (uint8_t *)pp;
+ p[0] = x & 0xff;
+ p[1] = (x >> 8) & 0xff;
+}
+#endif
+
+#if JANSSON_MAJOR_VERSION >= 2
+#define JSON_LOADS(str, err_ptr) json_loads((str), 0, (err_ptr))
+#else
+#define JSON_LOADS(str, err_ptr) json_loads((str), (err_ptr))
+#endif
+
+#define USER_AGENT PACKAGE_NAME "/" PACKAGE_VERSION
+
+void sha256_init(uint32_t *state);
+void sha256_transform(uint32_t *state, const uint32_t *block, int swap);
+void sha256d(unsigned char *hash, const
unsigned char *data, int len); + +#if defined(__ARM_NEON__) || defined(__i386__) || defined(__x86_64__) +#define HAVE_SHA256_4WAY 0 +int sha256_use_4way(); +void sha256_init_4way(uint32_t *state); +void sha256_transform_4way(uint32_t *state, const uint32_t *block, int swap); +#endif + +#if defined(__x86_64__) && defined(USE_AVX2) +#define HAVE_SHA256_8WAY 0 +int sha256_use_8way(); +void sha256_init_8way(uint32_t *state); +void sha256_transform_8way(uint32_t *state, const uint32_t *block, int swap); +#endif + +extern int scanhash_sha256d(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); + +extern unsigned char *scrypt_buffer_alloc(); + +extern int scanhash_scrypt(int thr_id, uint32_t *pdata, + unsigned char *scratchbuf, const uint32_t *ptarget, + uint32_t max_nonce, unsigned long *hashes_done); + +extern int scanhash_heavy(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, + unsigned long *hashes_done, uint32_t maxvote); + +extern int scanhash_fugue256(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, + unsigned long *hashes_done); + +extern void heavycoin_hash(unsigned char* output, const unsigned char* input, int len); + +struct thr_info { + int id; + pthread_t pth; + struct thread_q *q; +}; + +struct work_restart { + volatile unsigned long restart; + char padding[128 - sizeof(unsigned long)]; +}; + +extern bool opt_debug; +extern bool opt_protocol; +extern int opt_timeout; +extern bool want_longpoll; +extern bool have_longpoll; +extern bool want_stratum; +extern bool have_stratum; +extern char *opt_cert; +extern char *opt_proxy; +extern long opt_proxy_type; +extern bool use_syslog; +extern pthread_mutex_t applog_lock; +extern struct thr_info *thr_info; +extern int longpoll_thr_id; +extern int stratum_thr_id; +extern struct work_restart *work_restart; +extern bool opt_trust_pool; +extern uint16_t opt_vote; + +extern void applog(int prio, const char *fmt, ...); +extern json_t *json_rpc_call(CURL *curl, const char *url, const char *userpass, + const char *rpc_req, bool, bool, int *); +extern char *bin2hex(const unsigned char *p, size_t len); +extern bool hex2bin(unsigned char *p, const char *hexstr, size_t len); +extern int timeval_subtract(struct timeval *result, struct timeval *x, + struct timeval *y); +extern bool fulltest(const uint32_t *hash, const uint32_t *target); +extern void diff_to_target(uint32_t *target, double diff); + +struct stratum_job { + char *job_id; + unsigned char prevhash[32]; + size_t coinbase_size; + unsigned char *coinbase; + unsigned char *xnonce2; + int merkle_count; + unsigned char **merkle; + unsigned char version[4]; + unsigned char nbits[4]; + unsigned char ntime[4]; + bool clean; + unsigned char nreward[2]; + double diff; +}; + +struct stratum_ctx { + char *url; + + CURL *curl; + char *curl_url; + char curl_err_str[CURL_ERROR_SIZE]; + curl_socket_t sock; + size_t sockbuf_size; + char *sockbuf; + pthread_mutex_t sock_lock; + + double next_diff; + + char *session_id; + size_t xnonce1_size; + unsigned char *xnonce1; + size_t xnonce2_size; + struct stratum_job job; + pthread_mutex_t work_lock; +}; + +bool stratum_socket_full(struct stratum_ctx *sctx, int timeout); +bool stratum_send_line(struct stratum_ctx *sctx, char *s); +char *stratum_recv_line(struct stratum_ctx *sctx); +bool stratum_connect(struct stratum_ctx *sctx, const char *url); +void stratum_disconnect(struct stratum_ctx *sctx); +bool stratum_subscribe(struct stratum_ctx *sctx); +bool 
stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass); +bool stratum_handle_method(struct stratum_ctx *sctx, const char *s); + +struct thread_q; + +extern struct thread_q *tq_new(void); +extern void tq_free(struct thread_q *tq); +extern bool tq_push(struct thread_q *tq, void *data); +extern void *tq_pop(struct thread_q *tq, const struct timespec *abstime); +extern void tq_freeze(struct thread_q *tq); +extern void tq_thaw(struct thread_q *tq); + +#ifdef __cplusplus +} +#endif + +#endif /* __MINER_H__ */ diff --git a/missing b/missing new file mode 100644 index 0000000..1c8ff70 --- /dev/null +++ b/missing @@ -0,0 +1,367 @@ +#! /bin/sh +# Common stub for a few missing GNU programs while installing. + +scriptversion=2006-05-10.23 + +# Copyright (C) 1996, 1997, 1999, 2000, 2002, 2003, 2004, 2005, 2006 +# Free Software Foundation, Inc. +# Originally by Fran,cois Pinard , 1996. + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA +# 02110-1301, USA. + +# As a special exception to the GNU General Public License, if you +# distribute this file as part of a program that contains a +# configuration script generated by Autoconf, you may include it under +# the same distribution terms that you use for the rest of that program. + +if test $# -eq 0; then + echo 1>&2 "Try \`$0 --help' for more information" + exit 1 +fi + +run=: +sed_output='s/.* --output[ =]\([^ ]*\).*/\1/p' +sed_minuso='s/.* -o \([^ ]*\).*/\1/p' + +# In the cases where this matters, `missing' is being run in the +# srcdir already. +if test -f configure.ac; then + configure_ac=configure.ac +else + configure_ac=configure.in +fi + +msg="missing on your system" + +case $1 in +--run) + # Try to run requested program, and just exit if it succeeds. + run= + shift + "$@" && exit 0 + # Exit code 63 means version mismatch. This often happens + # when the user try to use an ancient version of a tool on + # a file that requires a minimum version. In this case we + # we should proceed has if the program had been absent, or + # if --run hadn't been passed. + if test $? = 63; then + run=: + msg="probably too old" + fi + ;; + + -h|--h|--he|--hel|--help) + echo "\ +$0 [OPTION]... PROGRAM [ARGUMENT]... + +Handle \`PROGRAM [ARGUMENT]...' for when PROGRAM is missing, or return an +error status if there is no known handling for PROGRAM. 
+ +Options: + -h, --help display this help and exit + -v, --version output version information and exit + --run try to run the given command, and emulate it if it fails + +Supported PROGRAM values: + aclocal touch file \`aclocal.m4' + autoconf touch file \`configure' + autoheader touch file \`config.h.in' + autom4te touch the output file, or create a stub one + automake touch all \`Makefile.in' files + bison create \`y.tab.[ch]', if possible, from existing .[ch] + flex create \`lex.yy.c', if possible, from existing .c + help2man touch the output file + lex create \`lex.yy.c', if possible, from existing .c + makeinfo touch the output file + tar try tar, gnutar, gtar, then tar without non-portable flags + yacc create \`y.tab.[ch]', if possible, from existing .[ch] + +Send bug reports to ." + exit $? + ;; + + -v|--v|--ve|--ver|--vers|--versi|--versio|--version) + echo "missing $scriptversion (GNU Automake)" + exit $? + ;; + + -*) + echo 1>&2 "$0: Unknown \`$1' option" + echo 1>&2 "Try \`$0 --help' for more information" + exit 1 + ;; + +esac + +# Now exit if we have it, but it failed. Also exit now if we +# don't have it and --version was passed (most likely to detect +# the program). +case $1 in + lex|yacc) + # Not GNU programs, they don't have --version. + ;; + + tar) + if test -n "$run"; then + echo 1>&2 "ERROR: \`tar' requires --run" + exit 1 + elif test "x$2" = "x--version" || test "x$2" = "x--help"; then + exit 1 + fi + ;; + + *) + if test -z "$run" && ($1 --version) > /dev/null 2>&1; then + # We have it, but it failed. + exit 1 + elif test "x$2" = "x--version" || test "x$2" = "x--help"; then + # Could not run --version or --help. This is probably someone + # running `$TOOL --version' or `$TOOL --help' to check whether + # $TOOL exists and not knowing $TOOL uses missing. + exit 1 + fi + ;; +esac + +# If it does not exist, or fails to run (possibly an outdated version), +# try to emulate it. +case $1 in + aclocal*) + echo 1>&2 "\ +WARNING: \`$1' is $msg. You should only need it if + you modified \`acinclude.m4' or \`${configure_ac}'. You might want + to install the \`Automake' and \`Perl' packages. Grab them from + any GNU archive site." + touch aclocal.m4 + ;; + + autoconf) + echo 1>&2 "\ +WARNING: \`$1' is $msg. You should only need it if + you modified \`${configure_ac}'. You might want to install the + \`Autoconf' and \`GNU m4' packages. Grab them from any GNU + archive site." + touch configure + ;; + + autoheader) + echo 1>&2 "\ +WARNING: \`$1' is $msg. You should only need it if + you modified \`acconfig.h' or \`${configure_ac}'. You might want + to install the \`Autoconf' and \`GNU m4' packages. Grab them + from any GNU archive site." + files=`sed -n 's/^[ ]*A[CM]_CONFIG_HEADER(\([^)]*\)).*/\1/p' ${configure_ac}` + test -z "$files" && files="config.h" + touch_files= + for f in $files; do + case $f in + *:*) touch_files="$touch_files "`echo "$f" | + sed -e 's/^[^:]*://' -e 's/:.*//'`;; + *) touch_files="$touch_files $f.in";; + esac + done + touch $touch_files + ;; + + automake*) + echo 1>&2 "\ +WARNING: \`$1' is $msg. You should only need it if + you modified \`Makefile.am', \`acinclude.m4' or \`${configure_ac}'. + You might want to install the \`Automake' and \`Perl' packages. + Grab them from any GNU archive site." + find . -type f -name Makefile.am -print | + sed 's/\.am$/.in/' | + while read f; do touch "$f"; done + ;; + + autom4te) + echo 1>&2 "\ +WARNING: \`$1' is needed, but is $msg. 
+ You might have modified some files without having the + proper tools for further handling them. + You can get \`$1' as part of \`Autoconf' from any GNU + archive site." + + file=`echo "$*" | sed -n "$sed_output"` + test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"` + if test -f "$file"; then + touch $file + else + test -z "$file" || exec >$file + echo "#! /bin/sh" + echo "# Created by GNU Automake missing as a replacement of" + echo "# $ $@" + echo "exit 0" + chmod +x $file + exit 1 + fi + ;; + + bison|yacc) + echo 1>&2 "\ +WARNING: \`$1' $msg. You should only need it if + you modified a \`.y' file. You may need the \`Bison' package + in order for those modifications to take effect. You can get + \`Bison' from any GNU archive site." + rm -f y.tab.c y.tab.h + if test $# -ne 1; then + eval LASTARG="\${$#}" + case $LASTARG in + *.y) + SRCFILE=`echo "$LASTARG" | sed 's/y$/c/'` + if test -f "$SRCFILE"; then + cp "$SRCFILE" y.tab.c + fi + SRCFILE=`echo "$LASTARG" | sed 's/y$/h/'` + if test -f "$SRCFILE"; then + cp "$SRCFILE" y.tab.h + fi + ;; + esac + fi + if test ! -f y.tab.h; then + echo >y.tab.h + fi + if test ! -f y.tab.c; then + echo 'main() { return 0; }' >y.tab.c + fi + ;; + + lex|flex) + echo 1>&2 "\ +WARNING: \`$1' is $msg. You should only need it if + you modified a \`.l' file. You may need the \`Flex' package + in order for those modifications to take effect. You can get + \`Flex' from any GNU archive site." + rm -f lex.yy.c + if test $# -ne 1; then + eval LASTARG="\${$#}" + case $LASTARG in + *.l) + SRCFILE=`echo "$LASTARG" | sed 's/l$/c/'` + if test -f "$SRCFILE"; then + cp "$SRCFILE" lex.yy.c + fi + ;; + esac + fi + if test ! -f lex.yy.c; then + echo 'main() { return 0; }' >lex.yy.c + fi + ;; + + help2man) + echo 1>&2 "\ +WARNING: \`$1' is $msg. You should only need it if + you modified a dependency of a manual page. You may need the + \`Help2man' package in order for those modifications to take + effect. You can get \`Help2man' from any GNU archive site." + + file=`echo "$*" | sed -n "$sed_output"` + test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"` + if test -f "$file"; then + touch $file + else + test -z "$file" || exec >$file + echo ".ab help2man is required to generate this page" + exit 1 + fi + ;; + + makeinfo) + echo 1>&2 "\ +WARNING: \`$1' is $msg. You should only need it if + you modified a \`.texi' or \`.texinfo' file, or any other file + indirectly affecting the aspect of the manual. The spurious + call might also be the consequence of using a buggy \`make' (AIX, + DU, IRIX). You might want to install the \`Texinfo' package or + the \`GNU make' package. Grab either from any GNU archive site." + # The file to touch is that specified with -o ... + file=`echo "$*" | sed -n "$sed_output"` + test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"` + if test -z "$file"; then + # ... or it is the one specified with @setfilename ... + infile=`echo "$*" | sed 's/.* \([^ ]*\) *$/\1/'` + file=`sed -n ' + /^@setfilename/{ + s/.* \([^ ]*\) *$/\1/ + p + q + }' $infile` + # ... or it is derived from the source name (dir/f.texi becomes f.info) + test -z "$file" && file=`echo "$infile" | sed 's,.*/,,;s,.[^.]*$,,'`.info + fi + # If the file does not exist, the user really needs makeinfo; + # let's fail without touching anything. + test -f $file || exit 1 + touch $file + ;; + + tar) + shift + + # We have already tried tar in the generic part. + # Look for gnutar/gtar before invocation to avoid ugly error + # messages. 
+ if (gnutar --version > /dev/null 2>&1); then + gnutar "$@" && exit 0 + fi + if (gtar --version > /dev/null 2>&1); then + gtar "$@" && exit 0 + fi + firstarg="$1" + if shift; then + case $firstarg in + *o*) + firstarg=`echo "$firstarg" | sed s/o//` + tar "$firstarg" "$@" && exit 0 + ;; + esac + case $firstarg in + *h*) + firstarg=`echo "$firstarg" | sed s/h//` + tar "$firstarg" "$@" && exit 0 + ;; + esac + fi + + echo 1>&2 "\ +WARNING: I can't seem to be able to run \`tar' with the given arguments. + You may want to install GNU tar or Free paxutils, or check the + command line arguments." + exit 1 + ;; + + *) + echo 1>&2 "\ +WARNING: \`$1' is needed, and is $msg. + You might have modified some files without having the + proper tools for further handling them. Check the \`README' file, + it often tells you about the needed prerequisites for installing + this package. You may also peek at any GNU archive site, in case + some other package would contain this missing \`$1' program." + exit 1 + ;; +esac + +exit 0 + +# Local variables: +# eval: (add-hook 'write-file-hooks 'time-stamp) +# time-stamp-start: "scriptversion=" +# time-stamp-format: "%:y-%02m-%02d.%02H" +# time-stamp-end: "$" +# End: diff --git a/scrypt.c b/scrypt.c new file mode 100644 index 0000000..5efd0e2 --- /dev/null +++ b/scrypt.c @@ -0,0 +1,756 @@ +/* + * Copyright 2009 Colin Percival, 2011 ArtForz, 2011-2013 pooler + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. 
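+ *
+ * Added note on the constants that follow: keypad, innerpad, outerpad
+ * and finalblk are pre-computed SHA-256 padding blocks; the last word
+ * of each is the total message length in bits (0x280 = 80 bytes,
+ * 0x4a0 = 148 bytes, 0x300 = 96 bytes, 0x620 = 196 bytes), which fixes
+ * the HMAC/PBKDF2 input sizes used by the 80-byte block-header scrypt
+ * routines below.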
+ */ + +#include "cpuminer-config.h" +#include "miner.h" + +#include +#include +#include + +static const uint32_t keypad[12] = { + 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280 +}; +static const uint32_t innerpad[11] = { + 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x000004a0 +}; +static const uint32_t outerpad[8] = { + 0x80000000, 0, 0, 0, 0, 0, 0, 0x00000300 +}; +static const uint32_t finalblk[16] = { + 0x00000001, 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000620 +}; + +static inline void HMAC_SHA256_80_init(const uint32_t *key, + uint32_t *tstate, uint32_t *ostate) +{ + uint32_t ihash[8]; + uint32_t pad[16]; + int i; + + /* tstate is assumed to contain the midstate of key */ + memcpy(pad, key + 16, 16); + memcpy(pad + 4, keypad, 48); + sha256_transform(tstate, pad, 0); + memcpy(ihash, tstate, 32); + + sha256_init(ostate); + for (i = 0; i < 8; i++) + pad[i] = ihash[i] ^ 0x5c5c5c5c; + for (; i < 16; i++) + pad[i] = 0x5c5c5c5c; + sha256_transform(ostate, pad, 0); + + sha256_init(tstate); + for (i = 0; i < 8; i++) + pad[i] = ihash[i] ^ 0x36363636; + for (; i < 16; i++) + pad[i] = 0x36363636; + sha256_transform(tstate, pad, 0); +} + +static inline void PBKDF2_SHA256_80_128(const uint32_t *tstate, + const uint32_t *ostate, const uint32_t *salt, uint32_t *output) +{ + uint32_t istate[8], ostate2[8]; + uint32_t ibuf[16], obuf[16]; + int i, j; + + memcpy(istate, tstate, 32); + sha256_transform(istate, salt, 0); + + memcpy(ibuf, salt + 16, 16); + memcpy(ibuf + 5, innerpad, 44); + memcpy(obuf + 8, outerpad, 32); + + for (i = 0; i < 4; i++) { + memcpy(obuf, istate, 32); + ibuf[4] = i + 1; + sha256_transform(obuf, ibuf, 0); + + memcpy(ostate2, ostate, 32); + sha256_transform(ostate2, obuf, 0); + for (j = 0; j < 8; j++) + output[8 * i + j] = swab32(ostate2[j]); + } +} + +static inline void PBKDF2_SHA256_128_32(uint32_t *tstate, uint32_t *ostate, + const uint32_t *salt, uint32_t *output) +{ + uint32_t buf[16]; + int i; + + sha256_transform(tstate, salt, 1); + sha256_transform(tstate, salt + 16, 1); + sha256_transform(tstate, finalblk, 0); + memcpy(buf, tstate, 32); + memcpy(buf + 8, outerpad, 32); + + sha256_transform(ostate, buf, 0); + for (i = 0; i < 8; i++) + output[i] = swab32(ostate[i]); +} + + +#if HAVE_SHA256_4WAY + +static const uint32_t keypad_4way[4 * 12] = { + 0x80000000, 0x80000000, 0x80000000, 0x80000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000280, 0x00000280, 0x00000280, 0x00000280 +}; +static const uint32_t innerpad_4way[4 * 11] = { + 0x80000000, 0x80000000, 0x80000000, 0x80000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x000004a0, 0x000004a0, 0x000004a0, 0x000004a0 +}; +static const uint32_t 
outerpad_4way[4 * 8] = { + 0x80000000, 0x80000000, 0x80000000, 0x80000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000300, 0x00000300, 0x00000300, 0x00000300 +}; +static const uint32_t finalblk_4way[4 * 16] __attribute__((aligned(16))) = { + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x80000000, 0x80000000, 0x80000000, 0x80000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000620, 0x00000620, 0x00000620, 0x00000620 +}; + +static inline void HMAC_SHA256_80_init_4way(const uint32_t *key, + uint32_t *tstate, uint32_t *ostate) +{ + uint32_t ihash[4 * 8] __attribute__((aligned(16))); + uint32_t pad[4 * 16] __attribute__((aligned(16))); + int i; + + /* tstate is assumed to contain the midstate of key */ + memcpy(pad, key + 4 * 16, 4 * 16); + memcpy(pad + 4 * 4, keypad_4way, 4 * 48); + sha256_transform_4way(tstate, pad, 0); + memcpy(ihash, tstate, 4 * 32); + + sha256_init_4way(ostate); + for (i = 0; i < 4 * 8; i++) + pad[i] = ihash[i] ^ 0x5c5c5c5c; + for (; i < 4 * 16; i++) + pad[i] = 0x5c5c5c5c; + sha256_transform_4way(ostate, pad, 0); + + sha256_init_4way(tstate); + for (i = 0; i < 4 * 8; i++) + pad[i] = ihash[i] ^ 0x36363636; + for (; i < 4 * 16; i++) + pad[i] = 0x36363636; + sha256_transform_4way(tstate, pad, 0); +} + +static inline void PBKDF2_SHA256_80_128_4way(const uint32_t *tstate, + const uint32_t *ostate, const uint32_t *salt, uint32_t *output) +{ + uint32_t istate[4 * 8] __attribute__((aligned(16))); + uint32_t ostate2[4 * 8] __attribute__((aligned(16))); + uint32_t ibuf[4 * 16] __attribute__((aligned(16))); + uint32_t obuf[4 * 16] __attribute__((aligned(16))); + int i, j; + + memcpy(istate, tstate, 4 * 32); + sha256_transform_4way(istate, salt, 0); + + memcpy(ibuf, salt + 4 * 16, 4 * 16); + memcpy(ibuf + 4 * 5, innerpad_4way, 4 * 44); + memcpy(obuf + 4 * 8, outerpad_4way, 4 * 32); + + for (i = 0; i < 4; i++) { + memcpy(obuf, istate, 4 * 32); + ibuf[4 * 4 + 0] = i + 1; + ibuf[4 * 4 + 1] = i + 1; + ibuf[4 * 4 + 2] = i + 1; + ibuf[4 * 4 + 3] = i + 1; + sha256_transform_4way(obuf, ibuf, 0); + + memcpy(ostate2, ostate, 4 * 32); + sha256_transform_4way(ostate2, obuf, 0); + for (j = 0; j < 4 * 8; j++) + output[4 * 8 * i + j] = swab32(ostate2[j]); + } +} + +static inline void PBKDF2_SHA256_128_32_4way(uint32_t *tstate, + uint32_t *ostate, const uint32_t *salt, uint32_t *output) +{ + uint32_t buf[4 * 16] __attribute__((aligned(16))); + int i; + + sha256_transform_4way(tstate, salt, 1); + sha256_transform_4way(tstate, salt + 4 * 16, 1); + sha256_transform_4way(tstate, finalblk_4way, 0); + memcpy(buf, tstate, 4 * 32); + memcpy(buf + 4 * 8, outerpad_4way, 4 * 32); + + sha256_transform_4way(ostate, buf, 0); + for (i = 0; i < 4 
* 8; i++) + output[i] = swab32(ostate[i]); +} + +#endif /* HAVE_SHA256_4WAY */ + + +#if HAVE_SHA256_8WAY + +static const uint32_t finalblk_8way[8 * 16] __attribute__((aligned(32))) = { + 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620 +}; + +static inline void HMAC_SHA256_80_init_8way(const uint32_t *key, + uint32_t *tstate, uint32_t *ostate) +{ + uint32_t ihash[8 * 8] __attribute__((aligned(32))); + uint32_t pad[8 * 16] __attribute__((aligned(32))); + int i; + + /* tstate is assumed to contain the midstate of key */ + memcpy(pad, key + 8 * 16, 8 * 16); + for (i = 0; i < 8; i++) + pad[8 * 4 + i] = 0x80000000; + memset(pad + 8 * 5, 0x00, 8 * 40); + for (i = 0; i < 8; i++) + pad[8 * 15 + i] = 0x00000280; + sha256_transform_8way(tstate, pad, 0); + memcpy(ihash, tstate, 8 * 32); + + sha256_init_8way(ostate); + for (i = 0; i < 8 * 8; i++) + pad[i] = ihash[i] ^ 0x5c5c5c5c; + for (; i < 8 * 16; i++) + pad[i] = 0x5c5c5c5c; + sha256_transform_8way(ostate, pad, 0); + + sha256_init_8way(tstate); + for (i = 0; i < 8 * 8; i++) + pad[i] = ihash[i] ^ 0x36363636; + for (; i < 8 * 16; i++) + pad[i] = 0x36363636; + sha256_transform_8way(tstate, pad, 0); +} + +static inline void PBKDF2_SHA256_80_128_8way(const uint32_t *tstate, + const uint32_t *ostate, const uint32_t *salt, uint32_t *output) +{ + uint32_t istate[8 * 8] __attribute__((aligned(32))); + uint32_t ostate2[8 * 8] __attribute__((aligned(32))); + uint32_t ibuf[8 * 16] __attribute__((aligned(32))); + uint32_t obuf[8 * 16] __attribute__((aligned(32))); + int i, j; + + memcpy(istate, tstate, 8 * 32); + sha256_transform_8way(istate, salt, 0); + + memcpy(ibuf, salt + 8 * 16, 8 * 16); + for (i = 0; i < 8; i++) + ibuf[8 * 5 + i] = 0x80000000; + memset(ibuf + 8 * 6, 0x00, 8 * 36); + for (i = 0; i < 8; i++) + ibuf[8 * 15 + i] = 0x000004a0; + + for (i = 0; i < 8; i++) + obuf[8 * 8 + i] = 0x80000000; + memset(obuf + 8 * 9, 0x00, 8 * 24); + for (i = 0; i < 8; i++) + obuf[8 * 15 + i] = 0x00000300; + + for (i = 0; i < 4; i++) { + memcpy(obuf, istate, 8 * 32); + ibuf[8 * 4 + 
0] = i + 1; + ibuf[8 * 4 + 1] = i + 1; + ibuf[8 * 4 + 2] = i + 1; + ibuf[8 * 4 + 3] = i + 1; + ibuf[8 * 4 + 4] = i + 1; + ibuf[8 * 4 + 5] = i + 1; + ibuf[8 * 4 + 6] = i + 1; + ibuf[8 * 4 + 7] = i + 1; + sha256_transform_8way(obuf, ibuf, 0); + + memcpy(ostate2, ostate, 8 * 32); + sha256_transform_8way(ostate2, obuf, 0); + for (j = 0; j < 8 * 8; j++) + output[8 * 8 * i + j] = swab32(ostate2[j]); + } +} + +static inline void PBKDF2_SHA256_128_32_8way(uint32_t *tstate, + uint32_t *ostate, const uint32_t *salt, uint32_t *output) +{ + uint32_t buf[8 * 16] __attribute__((aligned(32))); + int i; + + sha256_transform_8way(tstate, salt, 1); + sha256_transform_8way(tstate, salt + 8 * 16, 1); + sha256_transform_8way(tstate, finalblk_8way, 0); + + memcpy(buf, tstate, 8 * 32); + for (i = 0; i < 8; i++) + buf[8 * 8 + i] = 0x80000000; + memset(buf + 8 * 9, 0x00, 8 * 24); + for (i = 0; i < 8; i++) + buf[8 * 15 + i] = 0x00000300; + sha256_transform_8way(ostate, buf, 0); + + for (i = 0; i < 8 * 8; i++) + output[i] = swab32(ostate[i]); +} + +#endif /* HAVE_SHA256_8WAY */ + + +#if defined(__x86_64__) + +#define SCRYPT_MAX_WAYS 1 +#define HAVE_SCRYPT_3WAY 0 +#define scrypt_best_throughput() 1 +static void scrypt_core(uint32_t *X, uint32_t *V); +void scrypt_core_3way(uint32_t *X, uint32_t *V); +#if defined(USE_AVX2) +#undef SCRYPT_MAX_WAYS +#define SCRYPT_MAX_WAYS 21 +#define HAVE_SCRYPT_6WAY 0 +void scrypt_core_6way(uint32_t *X, uint32_t *V); +#endif + +#elif defined(__i386__) + +#define SCRYPT_MAX_WAYS 1 +#define scrypt_best_throughput() 1 +static void scrypt_core(uint32_t *X, uint32_t *V); + +#elif defined(__arm__) && defined(__APCS_32__) + +static void scrypt_core(uint32_t *X, uint32_t *V); +#if defined(__ARM_NEON__) +#undef HAVE_SHA256_4WAY +#define SCRYPT_MAX_WAYS 1 +#define HAVE_SCRYPT_3WAY 0 +#define scrypt_best_throughput() 1 +void scrypt_core_3way(uint32_t *X, uint32_t *V); +#endif + +#endif + +static inline void xor_salsa8(uint32_t B[16], const uint32_t Bx[16]) +{ + uint32_t x00,x01,x02,x03,x04,x05,x06,x07,x08,x09,x10,x11,x12,x13,x14,x15; + int i; + + x00 = (B[ 0] ^= Bx[ 0]); + x01 = (B[ 1] ^= Bx[ 1]); + x02 = (B[ 2] ^= Bx[ 2]); + x03 = (B[ 3] ^= Bx[ 3]); + x04 = (B[ 4] ^= Bx[ 4]); + x05 = (B[ 5] ^= Bx[ 5]); + x06 = (B[ 6] ^= Bx[ 6]); + x07 = (B[ 7] ^= Bx[ 7]); + x08 = (B[ 8] ^= Bx[ 8]); + x09 = (B[ 9] ^= Bx[ 9]); + x10 = (B[10] ^= Bx[10]); + x11 = (B[11] ^= Bx[11]); + x12 = (B[12] ^= Bx[12]); + x13 = (B[13] ^= Bx[13]); + x14 = (B[14] ^= Bx[14]); + x15 = (B[15] ^= Bx[15]); + for (i = 0; i < 8; i += 2) { +#define R(a, b) (((a) << (b)) | ((a) >> (32 - (b)))) + /* Operate on columns. */ + x04 ^= R(x00+x12, 7); x09 ^= R(x05+x01, 7); + x14 ^= R(x10+x06, 7); x03 ^= R(x15+x11, 7); + + x08 ^= R(x04+x00, 9); x13 ^= R(x09+x05, 9); + x02 ^= R(x14+x10, 9); x07 ^= R(x03+x15, 9); + + x12 ^= R(x08+x04,13); x01 ^= R(x13+x09,13); + x06 ^= R(x02+x14,13); x11 ^= R(x07+x03,13); + + x00 ^= R(x12+x08,18); x05 ^= R(x01+x13,18); + x10 ^= R(x06+x02,18); x15 ^= R(x11+x07,18); + + /* Operate on rows. 
*/ + x01 ^= R(x00+x03, 7); x06 ^= R(x05+x04, 7); + x11 ^= R(x10+x09, 7); x12 ^= R(x15+x14, 7); + + x02 ^= R(x01+x00, 9); x07 ^= R(x06+x05, 9); + x08 ^= R(x11+x10, 9); x13 ^= R(x12+x15, 9); + + x03 ^= R(x02+x01,13); x04 ^= R(x07+x06,13); + x09 ^= R(x08+x11,13); x14 ^= R(x13+x12,13); + + x00 ^= R(x03+x02,18); x05 ^= R(x04+x07,18); + x10 ^= R(x09+x08,18); x15 ^= R(x14+x13,18); +#undef R + } + B[ 0] += x00; + B[ 1] += x01; + B[ 2] += x02; + B[ 3] += x03; + B[ 4] += x04; + B[ 5] += x05; + B[ 6] += x06; + B[ 7] += x07; + B[ 8] += x08; + B[ 9] += x09; + B[10] += x10; + B[11] += x11; + B[12] += x12; + B[13] += x13; + B[14] += x14; + B[15] += x15; +} + +static inline void scrypt_core(uint32_t *X, uint32_t *V) +{ + uint32_t i, j, k; + + for (i = 0; i < 1024; i++) { + memcpy(&V[i * 32], X, 128); + xor_salsa8(&X[0], &X[16]); + xor_salsa8(&X[16], &X[0]); + } + for (i = 0; i < 1024; i++) { + j = 32 * (X[16] & 1023); + for (k = 0; k < 32; k++) + X[k] ^= V[j + k]; + xor_salsa8(&X[0], &X[16]); + xor_salsa8(&X[16], &X[0]); + } +} + +#ifndef SCRYPT_MAX_WAYS +#define SCRYPT_MAX_WAYS 1 +#define scrypt_best_throughput() 1 +#endif + +#define SCRYPT_BUFFER_SIZE (SCRYPT_MAX_WAYS * 131072 + 63) + +unsigned char *scrypt_buffer_alloc() +{ + return malloc(SCRYPT_BUFFER_SIZE); +} + +static void scrypt_1024_1_1_256(const uint32_t *input, uint32_t *output, + uint32_t *midstate, unsigned char *scratchpad) +{ + uint32_t tstate[8], ostate[8]; + uint32_t X[32]; + uint32_t *V; + + V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); + + memcpy(tstate, midstate, 32); + HMAC_SHA256_80_init(input, tstate, ostate); + PBKDF2_SHA256_80_128(tstate, ostate, input, X); + + scrypt_core(X, V); + + PBKDF2_SHA256_128_32(tstate, ostate, X, output); +} + +#if HAVE_SHA256_4WAY +static void scrypt_1024_1_1_256_4way(const uint32_t *input, + uint32_t *output, uint32_t *midstate, unsigned char *scratchpad) +{ + uint32_t tstate[4 * 8] __attribute__((aligned(128))); + uint32_t ostate[4 * 8] __attribute__((aligned(128))); + uint32_t W[4 * 32] __attribute__((aligned(128))); + uint32_t X[4 * 32] __attribute__((aligned(128))); + uint32_t *V; + int i, k; + + V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); + + for (i = 0; i < 20; i++) + for (k = 0; k < 4; k++) + W[4 * i + k] = input[k * 20 + i]; + for (i = 0; i < 8; i++) + for (k = 0; k < 4; k++) + tstate[4 * i + k] = midstate[i]; + HMAC_SHA256_80_init_4way(W, tstate, ostate); + PBKDF2_SHA256_80_128_4way(tstate, ostate, W, W); + for (i = 0; i < 32; i++) + for (k = 0; k < 4; k++) + X[k * 32 + i] = W[4 * i + k]; + scrypt_core(X + 0 * 32, V); + scrypt_core(X + 1 * 32, V); + scrypt_core(X + 2 * 32, V); + scrypt_core(X + 3 * 32, V); + for (i = 0; i < 32; i++) + for (k = 0; k < 4; k++) + W[4 * i + k] = X[k * 32 + i]; + PBKDF2_SHA256_128_32_4way(tstate, ostate, W, W); + for (i = 0; i < 8; i++) + for (k = 0; k < 4; k++) + output[k * 8 + i] = W[4 * i + k]; +} +#endif /* HAVE_SHA256_4WAY */ + +#if HAVE_SCRYPT_3WAY + +static void scrypt_1024_1_1_256_3way(const uint32_t *input, + uint32_t *output, uint32_t *midstate, unsigned char *scratchpad) +{ + uint32_t tstate[3 * 8], ostate[3 * 8]; + uint32_t X[3 * 32] __attribute__((aligned(64))); + uint32_t *V; + + V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); + + memcpy(tstate + 0, midstate, 32); + memcpy(tstate + 8, midstate, 32); + memcpy(tstate + 16, midstate, 32); + HMAC_SHA256_80_init(input + 0, tstate + 0, ostate + 0); + HMAC_SHA256_80_init(input + 20, tstate + 8, ostate + 8); + HMAC_SHA256_80_init(input + 
40, tstate + 16, ostate + 16); + PBKDF2_SHA256_80_128(tstate + 0, ostate + 0, input + 0, X + 0); + PBKDF2_SHA256_80_128(tstate + 8, ostate + 8, input + 20, X + 32); + PBKDF2_SHA256_80_128(tstate + 16, ostate + 16, input + 40, X + 64); + + scrypt_core_3way(X, V); + + PBKDF2_SHA256_128_32(tstate + 0, ostate + 0, X + 0, output + 0); + PBKDF2_SHA256_128_32(tstate + 8, ostate + 8, X + 32, output + 8); + PBKDF2_SHA256_128_32(tstate + 16, ostate + 16, X + 64, output + 16); +} + +#if HAVE_SHA256_4WAY +static void scrypt_1024_1_1_256_12way(const uint32_t *input, + uint32_t *output, uint32_t *midstate, unsigned char *scratchpad) +{ + uint32_t tstate[12 * 8] __attribute__((aligned(128))); + uint32_t ostate[12 * 8] __attribute__((aligned(128))); + uint32_t W[12 * 32] __attribute__((aligned(128))); + uint32_t X[12 * 32] __attribute__((aligned(128))); + uint32_t *V; + int i, j, k; + + V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); + + for (j = 0; j < 3; j++) + for (i = 0; i < 20; i++) + for (k = 0; k < 4; k++) + W[128 * j + 4 * i + k] = input[80 * j + k * 20 + i]; + for (j = 0; j < 3; j++) + for (i = 0; i < 8; i++) + for (k = 0; k < 4; k++) + tstate[32 * j + 4 * i + k] = midstate[i]; + HMAC_SHA256_80_init_4way(W + 0, tstate + 0, ostate + 0); + HMAC_SHA256_80_init_4way(W + 128, tstate + 32, ostate + 32); + HMAC_SHA256_80_init_4way(W + 256, tstate + 64, ostate + 64); + PBKDF2_SHA256_80_128_4way(tstate + 0, ostate + 0, W + 0, W + 0); + PBKDF2_SHA256_80_128_4way(tstate + 32, ostate + 32, W + 128, W + 128); + PBKDF2_SHA256_80_128_4way(tstate + 64, ostate + 64, W + 256, W + 256); + for (j = 0; j < 3; j++) + for (i = 0; i < 32; i++) + for (k = 0; k < 4; k++) + X[128 * j + k * 32 + i] = W[128 * j + 4 * i + k]; + scrypt_core_3way(X + 0 * 96, V); + scrypt_core_3way(X + 1 * 96, V); + scrypt_core_3way(X + 2 * 96, V); + scrypt_core_3way(X + 3 * 96, V); + for (j = 0; j < 3; j++) + for (i = 0; i < 32; i++) + for (k = 0; k < 4; k++) + W[128 * j + 4 * i + k] = X[128 * j + k * 32 + i]; + PBKDF2_SHA256_128_32_4way(tstate + 0, ostate + 0, W + 0, W + 0); + PBKDF2_SHA256_128_32_4way(tstate + 32, ostate + 32, W + 128, W + 128); + PBKDF2_SHA256_128_32_4way(tstate + 64, ostate + 64, W + 256, W + 256); + for (j = 0; j < 3; j++) + for (i = 0; i < 8; i++) + for (k = 0; k < 4; k++) + output[32 * j + k * 8 + i] = W[128 * j + 4 * i + k]; +} +#endif /* HAVE_SHA256_4WAY */ + +#endif /* HAVE_SCRYPT_3WAY */ + +#if HAVE_SCRYPT_6WAY +static void scrypt_1024_1_1_256_24way(const uint32_t *input, + uint32_t *output, uint32_t *midstate, unsigned char *scratchpad) +{ + uint32_t tstate[24 * 8] __attribute__((aligned(128))); + uint32_t ostate[24 * 8] __attribute__((aligned(128))); + uint32_t W[24 * 32] __attribute__((aligned(128))); + uint32_t X[24 * 32] __attribute__((aligned(128))); + uint32_t *V; + int i, j, k; + + V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); + + for (j = 0; j < 3; j++) + for (i = 0; i < 20; i++) + for (k = 0; k < 8; k++) + W[8 * 32 * j + 8 * i + k] = input[8 * 20 * j + k * 20 + i]; + for (j = 0; j < 3; j++) + for (i = 0; i < 8; i++) + for (k = 0; k < 8; k++) + tstate[8 * 8 * j + 8 * i + k] = midstate[i]; + HMAC_SHA256_80_init_8way(W + 0, tstate + 0, ostate + 0); + HMAC_SHA256_80_init_8way(W + 256, tstate + 64, ostate + 64); + HMAC_SHA256_80_init_8way(W + 512, tstate + 128, ostate + 128); + PBKDF2_SHA256_80_128_8way(tstate + 0, ostate + 0, W + 0, W + 0); + PBKDF2_SHA256_80_128_8way(tstate + 64, ostate + 64, W + 256, W + 256); + PBKDF2_SHA256_80_128_8way(tstate + 128, ostate + 
128, W + 512, W + 512); + for (j = 0; j < 3; j++) + for (i = 0; i < 32; i++) + for (k = 0; k < 8; k++) + X[8 * 32 * j + k * 32 + i] = W[8 * 32 * j + 8 * i + k]; + scrypt_core_6way(X + 0 * 32, V); + scrypt_core_6way(X + 6 * 32, V); + scrypt_core_6way(X + 12 * 32, V); + scrypt_core_6way(X + 18 * 32, V); + for (j = 0; j < 3; j++) + for (i = 0; i < 32; i++) + for (k = 0; k < 8; k++) + W[8 * 32 * j + 8 * i + k] = X[8 * 32 * j + k * 32 + i]; + PBKDF2_SHA256_128_32_8way(tstate + 0, ostate + 0, W + 0, W + 0); + PBKDF2_SHA256_128_32_8way(tstate + 64, ostate + 64, W + 256, W + 256); + PBKDF2_SHA256_128_32_8way(tstate + 128, ostate + 128, W + 512, W + 512); + for (j = 0; j < 3; j++) + for (i = 0; i < 8; i++) + for (k = 0; k < 8; k++) + output[8 * 8 * j + k * 8 + i] = W[8 * 32 * j + 8 * i + k]; +} +#endif /* HAVE_SCRYPT_6WAY */ + +int scanhash_scrypt(int thr_id, uint32_t *pdata, + unsigned char *scratchbuf, const uint32_t *ptarget, + uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8]; + uint32_t midstate[8]; + uint32_t n = pdata[19] - 1; + const uint32_t Htarg = ptarget[7]; + int throughput = scrypt_best_throughput(); + int i; + +#if HAVE_SHA256_4WAY + if (sha256_use_4way()) + throughput *= 4; +#endif + + for (i = 0; i < throughput; i++) + memcpy(data + i * 20, pdata, 80); + + sha256_init(midstate); + sha256_transform(midstate, data, 0); + + do { + for (i = 0; i < throughput; i++) + data[i * 20 + 19] = ++n; + +#if defined(HAVE_SHA256_4WAY) + if (throughput == 4) + scrypt_1024_1_1_256_4way(data, hash, midstate, scratchbuf); + else +#endif +#if defined(HAVE_SCRYPT_3WAY) && defined(HAVE_SHA256_4WAY) + if (throughput == 12) + scrypt_1024_1_1_256_12way(data, hash, midstate, scratchbuf); + else +#endif +#if defined(HAVE_SCRYPT_6WAY) + if (throughput == 24) + scrypt_1024_1_1_256_24way(data, hash, midstate, scratchbuf); + else +#endif +#if defined(HAVE_SCRYPT_3WAY) + if (throughput == 3) + scrypt_1024_1_1_256_3way(data, hash, midstate, scratchbuf); + else +#endif + scrypt_1024_1_1_256(data, hash, midstate, scratchbuf); + + for (i = 0; i < throughput; i++) { + if (hash[i * 8 + 7] <= Htarg && fulltest(hash + i * 8, ptarget)) { + *hashes_done = n - pdata[19] + 1; + pdata[19] = data[i * 20 + 19]; + return 1; + } + } + } while (n < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = n - pdata[19] + 1; + pdata[19] = n; + return 0; +} diff --git a/sha2.c b/sha2.c new file mode 100644 index 0000000..4bfdcc7 --- /dev/null +++ b/sha2.c @@ -0,0 +1,630 @@ +/* + * Copyright 2011 ArtForz + * Copyright 2011-2013 pooler + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. See COPYING for more details. 
+ */ + +#include "cpuminer-config.h" +#include "miner.h" + +#include +#include + +#if defined(__arm__) && defined(__APCS_32__) +#define EXTERN_SHA256 +#endif + +static const uint32_t sha256_h[8] = { + 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, + 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 +}; + +static const uint32_t sha256_k[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +}; + +void sha256_init(uint32_t *state) +{ + memcpy(state, sha256_h, 32); +} + +/* Elementary functions used by SHA256 */ +#define Ch(x, y, z) ((x & (y ^ z)) ^ z) +#define Maj(x, y, z) ((x & (y | z)) | (y & z)) +#define ROTR(x, n) ((x >> n) | (x << (32 - n))) +#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) +#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) +#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ (x >> 3)) +#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ (x >> 10)) + +/* SHA256 round function */ +#define RND(a, b, c, d, e, f, g, h, k) \ + do { \ + t0 = h + S1(e) + Ch(e, f, g) + k; \ + t1 = S0(a) + Maj(a, b, c); \ + d += t0; \ + h = t0 + t1; \ + } while (0) + +/* Adjusted round function for rotating state */ +#define RNDr(S, W, i) \ + RND(S[(64 - i) % 8], S[(65 - i) % 8], \ + S[(66 - i) % 8], S[(67 - i) % 8], \ + S[(68 - i) % 8], S[(69 - i) % 8], \ + S[(70 - i) % 8], S[(71 - i) % 8], \ + W[i] + sha256_k[i]) + +#ifndef EXTERN_SHA256 + +/* + * SHA256 block compression function. The 256-bit state is transformed via + * the 512-bit input block to produce a new state. + */ +void sha256_transform(uint32_t *state, const uint32_t *block, int swap) +{ + uint32_t W[64]; + uint32_t S[8]; + uint32_t t0, t1; + int i; + + /* 1. Prepare message schedule W. */ + if (swap) { + for (i = 0; i < 16; i++) + W[i] = swab32(block[i]); + } else + memcpy(W, block, 64); + for (i = 16; i < 64; i += 2) { + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; + } + + /* 2. Initialize working variables. */ + memcpy(S, state, 32); + + /* 3. Mix. 
*/ + RNDr(S, W, 0); + RNDr(S, W, 1); + RNDr(S, W, 2); + RNDr(S, W, 3); + RNDr(S, W, 4); + RNDr(S, W, 5); + RNDr(S, W, 6); + RNDr(S, W, 7); + RNDr(S, W, 8); + RNDr(S, W, 9); + RNDr(S, W, 10); + RNDr(S, W, 11); + RNDr(S, W, 12); + RNDr(S, W, 13); + RNDr(S, W, 14); + RNDr(S, W, 15); + RNDr(S, W, 16); + RNDr(S, W, 17); + RNDr(S, W, 18); + RNDr(S, W, 19); + RNDr(S, W, 20); + RNDr(S, W, 21); + RNDr(S, W, 22); + RNDr(S, W, 23); + RNDr(S, W, 24); + RNDr(S, W, 25); + RNDr(S, W, 26); + RNDr(S, W, 27); + RNDr(S, W, 28); + RNDr(S, W, 29); + RNDr(S, W, 30); + RNDr(S, W, 31); + RNDr(S, W, 32); + RNDr(S, W, 33); + RNDr(S, W, 34); + RNDr(S, W, 35); + RNDr(S, W, 36); + RNDr(S, W, 37); + RNDr(S, W, 38); + RNDr(S, W, 39); + RNDr(S, W, 40); + RNDr(S, W, 41); + RNDr(S, W, 42); + RNDr(S, W, 43); + RNDr(S, W, 44); + RNDr(S, W, 45); + RNDr(S, W, 46); + RNDr(S, W, 47); + RNDr(S, W, 48); + RNDr(S, W, 49); + RNDr(S, W, 50); + RNDr(S, W, 51); + RNDr(S, W, 52); + RNDr(S, W, 53); + RNDr(S, W, 54); + RNDr(S, W, 55); + RNDr(S, W, 56); + RNDr(S, W, 57); + RNDr(S, W, 58); + RNDr(S, W, 59); + RNDr(S, W, 60); + RNDr(S, W, 61); + RNDr(S, W, 62); + RNDr(S, W, 63); + + /* 4. Mix local working variables into global state */ + for (i = 0; i < 8; i++) + state[i] += S[i]; +} + +#endif /* EXTERN_SHA256 */ + + +static const uint32_t sha256d_hash1[16] = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x80000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000100 +}; + +static void sha256d_80_swap(uint32_t *hash, const uint32_t *data) +{ + uint32_t S[16]; + int i; + + sha256_init(S); + sha256_transform(S, data, 0); + sha256_transform(S, data + 16, 0); + memcpy(S + 8, sha256d_hash1 + 8, 32); + sha256_init(hash); + sha256_transform(hash, S, 0); + for (i = 0; i < 8; i++) + hash[i] = swab32(hash[i]); +} + +void sha256d(unsigned char *hash, const unsigned char *data, int len) +{ + uint32_t S[16], T[16]; + int i, r; + + sha256_init(S); + for (r = len; r > -9; r -= 64) { + if (r < 64) + memset(T, 0, 64); + memcpy(T, data + len - r, r > 64 ? 64 : (r < 0 ? 
0 : r)); + if (r >= 0 && r < 64) + ((unsigned char *)T)[r] = 0x80; + for (i = 0; i < 16; i++) + T[i] = be32dec(T + i); + if (r < 56) + T[15] = 8 * len; + sha256_transform(S, T, 0); + } + memcpy(S + 8, sha256d_hash1 + 8, 32); + sha256_init(T); + sha256_transform(T, S, 0); + for (i = 0; i < 8; i++) + be32enc((uint32_t *)hash + i, T[i]); +} + +static inline void sha256d_preextend(uint32_t *W) +{ + W[16] = s1(W[14]) + W[ 9] + s0(W[ 1]) + W[ 0]; + W[17] = s1(W[15]) + W[10] + s0(W[ 2]) + W[ 1]; + W[18] = s1(W[16]) + W[11] + W[ 2]; + W[19] = s1(W[17]) + W[12] + s0(W[ 4]); + W[20] = W[13] + s0(W[ 5]) + W[ 4]; + W[21] = W[14] + s0(W[ 6]) + W[ 5]; + W[22] = W[15] + s0(W[ 7]) + W[ 6]; + W[23] = W[16] + s0(W[ 8]) + W[ 7]; + W[24] = W[17] + s0(W[ 9]) + W[ 8]; + W[25] = s0(W[10]) + W[ 9]; + W[26] = s0(W[11]) + W[10]; + W[27] = s0(W[12]) + W[11]; + W[28] = s0(W[13]) + W[12]; + W[29] = s0(W[14]) + W[13]; + W[30] = s0(W[15]) + W[14]; + W[31] = s0(W[16]) + W[15]; +} + +static inline void sha256d_prehash(uint32_t *S, const uint32_t *W) +{ + uint32_t t0, t1; + RNDr(S, W, 0); + RNDr(S, W, 1); + RNDr(S, W, 2); +} + +#ifdef EXTERN_SHA256 + +void sha256d_ms(uint32_t *hash, uint32_t *W, + const uint32_t *midstate, const uint32_t *prehash); + +#else + +static inline void sha256d_ms(uint32_t *hash, uint32_t *W, + const uint32_t *midstate, const uint32_t *prehash) +{ + uint32_t S[64]; + uint32_t t0, t1; + int i; + + S[18] = W[18]; + S[19] = W[19]; + S[20] = W[20]; + S[22] = W[22]; + S[23] = W[23]; + S[24] = W[24]; + S[30] = W[30]; + S[31] = W[31]; + + W[18] += s0(W[3]); + W[19] += W[3]; + W[20] += s1(W[18]); + W[21] = s1(W[19]); + W[22] += s1(W[20]); + W[23] += s1(W[21]); + W[24] += s1(W[22]); + W[25] = s1(W[23]) + W[18]; + W[26] = s1(W[24]) + W[19]; + W[27] = s1(W[25]) + W[20]; + W[28] = s1(W[26]) + W[21]; + W[29] = s1(W[27]) + W[22]; + W[30] += s1(W[28]) + W[23]; + W[31] += s1(W[29]) + W[24]; + for (i = 32; i < 64; i += 2) { + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; + } + + memcpy(S, prehash, 32); + + RNDr(S, W, 3); + RNDr(S, W, 4); + RNDr(S, W, 5); + RNDr(S, W, 6); + RNDr(S, W, 7); + RNDr(S, W, 8); + RNDr(S, W, 9); + RNDr(S, W, 10); + RNDr(S, W, 11); + RNDr(S, W, 12); + RNDr(S, W, 13); + RNDr(S, W, 14); + RNDr(S, W, 15); + RNDr(S, W, 16); + RNDr(S, W, 17); + RNDr(S, W, 18); + RNDr(S, W, 19); + RNDr(S, W, 20); + RNDr(S, W, 21); + RNDr(S, W, 22); + RNDr(S, W, 23); + RNDr(S, W, 24); + RNDr(S, W, 25); + RNDr(S, W, 26); + RNDr(S, W, 27); + RNDr(S, W, 28); + RNDr(S, W, 29); + RNDr(S, W, 30); + RNDr(S, W, 31); + RNDr(S, W, 32); + RNDr(S, W, 33); + RNDr(S, W, 34); + RNDr(S, W, 35); + RNDr(S, W, 36); + RNDr(S, W, 37); + RNDr(S, W, 38); + RNDr(S, W, 39); + RNDr(S, W, 40); + RNDr(S, W, 41); + RNDr(S, W, 42); + RNDr(S, W, 43); + RNDr(S, W, 44); + RNDr(S, W, 45); + RNDr(S, W, 46); + RNDr(S, W, 47); + RNDr(S, W, 48); + RNDr(S, W, 49); + RNDr(S, W, 50); + RNDr(S, W, 51); + RNDr(S, W, 52); + RNDr(S, W, 53); + RNDr(S, W, 54); + RNDr(S, W, 55); + RNDr(S, W, 56); + RNDr(S, W, 57); + RNDr(S, W, 58); + RNDr(S, W, 59); + RNDr(S, W, 60); + RNDr(S, W, 61); + RNDr(S, W, 62); + RNDr(S, W, 63); + + for (i = 0; i < 8; i++) + S[i] += midstate[i]; + + W[18] = S[18]; + W[19] = S[19]; + W[20] = S[20]; + W[22] = S[22]; + W[23] = S[23]; + W[24] = S[24]; + W[30] = S[30]; + W[31] = S[31]; + + memcpy(S + 8, sha256d_hash1 + 8, 32); + S[16] = s1(sha256d_hash1[14]) + sha256d_hash1[ 9] + s0(S[ 1]) + S[ 0]; + S[17] = s1(sha256d_hash1[15]) + sha256d_hash1[10] + s0(S[ 2]) + 
S[ 1]; + S[18] = s1(S[16]) + sha256d_hash1[11] + s0(S[ 3]) + S[ 2]; + S[19] = s1(S[17]) + sha256d_hash1[12] + s0(S[ 4]) + S[ 3]; + S[20] = s1(S[18]) + sha256d_hash1[13] + s0(S[ 5]) + S[ 4]; + S[21] = s1(S[19]) + sha256d_hash1[14] + s0(S[ 6]) + S[ 5]; + S[22] = s1(S[20]) + sha256d_hash1[15] + s0(S[ 7]) + S[ 6]; + S[23] = s1(S[21]) + S[16] + s0(sha256d_hash1[ 8]) + S[ 7]; + S[24] = s1(S[22]) + S[17] + s0(sha256d_hash1[ 9]) + sha256d_hash1[ 8]; + S[25] = s1(S[23]) + S[18] + s0(sha256d_hash1[10]) + sha256d_hash1[ 9]; + S[26] = s1(S[24]) + S[19] + s0(sha256d_hash1[11]) + sha256d_hash1[10]; + S[27] = s1(S[25]) + S[20] + s0(sha256d_hash1[12]) + sha256d_hash1[11]; + S[28] = s1(S[26]) + S[21] + s0(sha256d_hash1[13]) + sha256d_hash1[12]; + S[29] = s1(S[27]) + S[22] + s0(sha256d_hash1[14]) + sha256d_hash1[13]; + S[30] = s1(S[28]) + S[23] + s0(sha256d_hash1[15]) + sha256d_hash1[14]; + S[31] = s1(S[29]) + S[24] + s0(S[16]) + sha256d_hash1[15]; + for (i = 32; i < 60; i += 2) { + S[i] = s1(S[i - 2]) + S[i - 7] + s0(S[i - 15]) + S[i - 16]; + S[i+1] = s1(S[i - 1]) + S[i - 6] + s0(S[i - 14]) + S[i - 15]; + } + S[60] = s1(S[58]) + S[53] + s0(S[45]) + S[44]; + + sha256_init(hash); + + RNDr(hash, S, 0); + RNDr(hash, S, 1); + RNDr(hash, S, 2); + RNDr(hash, S, 3); + RNDr(hash, S, 4); + RNDr(hash, S, 5); + RNDr(hash, S, 6); + RNDr(hash, S, 7); + RNDr(hash, S, 8); + RNDr(hash, S, 9); + RNDr(hash, S, 10); + RNDr(hash, S, 11); + RNDr(hash, S, 12); + RNDr(hash, S, 13); + RNDr(hash, S, 14); + RNDr(hash, S, 15); + RNDr(hash, S, 16); + RNDr(hash, S, 17); + RNDr(hash, S, 18); + RNDr(hash, S, 19); + RNDr(hash, S, 20); + RNDr(hash, S, 21); + RNDr(hash, S, 22); + RNDr(hash, S, 23); + RNDr(hash, S, 24); + RNDr(hash, S, 25); + RNDr(hash, S, 26); + RNDr(hash, S, 27); + RNDr(hash, S, 28); + RNDr(hash, S, 29); + RNDr(hash, S, 30); + RNDr(hash, S, 31); + RNDr(hash, S, 32); + RNDr(hash, S, 33); + RNDr(hash, S, 34); + RNDr(hash, S, 35); + RNDr(hash, S, 36); + RNDr(hash, S, 37); + RNDr(hash, S, 38); + RNDr(hash, S, 39); + RNDr(hash, S, 40); + RNDr(hash, S, 41); + RNDr(hash, S, 42); + RNDr(hash, S, 43); + RNDr(hash, S, 44); + RNDr(hash, S, 45); + RNDr(hash, S, 46); + RNDr(hash, S, 47); + RNDr(hash, S, 48); + RNDr(hash, S, 49); + RNDr(hash, S, 50); + RNDr(hash, S, 51); + RNDr(hash, S, 52); + RNDr(hash, S, 53); + RNDr(hash, S, 54); + RNDr(hash, S, 55); + RNDr(hash, S, 56); + + hash[2] += hash[6] + S1(hash[3]) + Ch(hash[3], hash[4], hash[5]) + + S[57] + sha256_k[57]; + hash[1] += hash[5] + S1(hash[2]) + Ch(hash[2], hash[3], hash[4]) + + S[58] + sha256_k[58]; + hash[0] += hash[4] + S1(hash[1]) + Ch(hash[1], hash[2], hash[3]) + + S[59] + sha256_k[59]; + hash[7] += hash[3] + S1(hash[0]) + Ch(hash[0], hash[1], hash[2]) + + S[60] + sha256_k[60] + + sha256_h[7]; +} + +#endif /* EXTERN_SHA256 */ + +#if HAVE_SHA256_4WAY + +void sha256d_ms_4way(uint32_t *hash, uint32_t *data, + const uint32_t *midstate, const uint32_t *prehash); + +static inline int scanhash_sha256d_4way(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t data[4 * 64] __attribute__((aligned(128))); + uint32_t hash[4 * 8] __attribute__((aligned(32))); + uint32_t midstate[4 * 8] __attribute__((aligned(32))); + uint32_t prehash[4 * 8] __attribute__((aligned(32))); + uint32_t n = pdata[19] - 1; + const uint32_t first_nonce = pdata[19]; + const uint32_t Htarg = ptarget[7]; + int i, j; + + memcpy(data, pdata + 16, 64); + sha256d_preextend(data); + for (i = 31; i >= 0; i--) + for (j = 0; j < 4; j++) + data[i * 4 + j] = 
data[i]; + + sha256_init(midstate); + sha256_transform(midstate, pdata, 0); + memcpy(prehash, midstate, 32); + sha256d_prehash(prehash, pdata + 16); + for (i = 7; i >= 0; i--) { + for (j = 0; j < 4; j++) { + midstate[i * 4 + j] = midstate[i]; + prehash[i * 4 + j] = prehash[i]; + } + } + + do { + for (i = 0; i < 4; i++) + data[4 * 3 + i] = ++n; + + sha256d_ms_4way(hash, data, midstate, prehash); + + for (i = 0; i < 4; i++) { + if (swab32(hash[4 * 7 + i]) <= Htarg) { + pdata[19] = data[4 * 3 + i]; + sha256d_80_swap(hash, pdata); + if (fulltest(hash, ptarget)) { + *hashes_done = n - first_nonce + 1; + return 1; + } + } + } + } while (n < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = n - first_nonce + 1; + pdata[19] = n; + return 0; +} + +#endif /* HAVE_SHA256_4WAY */ + +#if HAVE_SHA256_8WAY + +void sha256d_ms_8way(uint32_t *hash, uint32_t *data, + const uint32_t *midstate, const uint32_t *prehash); + +static inline int scanhash_sha256d_8way(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t data[8 * 64] __attribute__((aligned(128))); + uint32_t hash[8 * 8] __attribute__((aligned(32))); + uint32_t midstate[8 * 8] __attribute__((aligned(32))); + uint32_t prehash[8 * 8] __attribute__((aligned(32))); + uint32_t n = pdata[19] - 1; + const uint32_t first_nonce = pdata[19]; + const uint32_t Htarg = ptarget[7]; + int i, j; + + memcpy(data, pdata + 16, 64); + sha256d_preextend(data); + for (i = 31; i >= 0; i--) + for (j = 0; j < 8; j++) + data[i * 8 + j] = data[i]; + + sha256_init(midstate); + sha256_transform(midstate, pdata, 0); + memcpy(prehash, midstate, 32); + sha256d_prehash(prehash, pdata + 16); + for (i = 7; i >= 0; i--) { + for (j = 0; j < 8; j++) { + midstate[i * 8 + j] = midstate[i]; + prehash[i * 8 + j] = prehash[i]; + } + } + + do { + for (i = 0; i < 8; i++) + data[8 * 3 + i] = ++n; + + sha256d_ms_8way(hash, data, midstate, prehash); + + for (i = 0; i < 8; i++) { + if (swab32(hash[8 * 7 + i]) <= Htarg) { + pdata[19] = data[8 * 3 + i]; + sha256d_80_swap(hash, pdata); + if (fulltest(hash, ptarget)) { + *hashes_done = n - first_nonce + 1; + return 1; + } + } + } + } while (n < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = n - first_nonce + 1; + pdata[19] = n; + return 0; +} + +#endif /* HAVE_SHA256_8WAY */ + +int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget, + uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t data[64] /* __attribute__((aligned(128))) */; + uint32_t hash[8] /* __attribute__((aligned(32))) */; + uint32_t midstate[8] /* __attribute__((aligned(32))) */; + uint32_t prehash[8] /* __attribute__((aligned(32))) */; + uint32_t n = pdata[19] - 1; + const uint32_t first_nonce = pdata[19]; + const uint32_t Htarg = ptarget[7]; + +#if HAVE_SHA256_8WAY + if (sha256_use_8way()) + return scanhash_sha256d_8way(thr_id, pdata, ptarget, + max_nonce, hashes_done); +#endif +#if HAVE_SHA256_4WAY + if (sha256_use_4way()) + return scanhash_sha256d_4way(thr_id, pdata, ptarget, + max_nonce, hashes_done); +#endif + + memcpy(data, pdata + 16, 64); + sha256d_preextend(data); + + sha256_init(midstate); + sha256_transform(midstate, pdata, 0); + memcpy(prehash, midstate, 32); + sha256d_prehash(prehash, pdata + 16); + + do { + data[3] = ++n; + sha256d_ms(hash, data, midstate, prehash); + if (swab32(hash[7]) <= Htarg) { + pdata[19] = data[3]; + sha256d_80_swap(hash, pdata); + if (fulltest(hash, ptarget)) { + *hashes_done = n - first_nonce + 1; + return 1; + } + } + } while 
(n < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = n - first_nonce + 1; + pdata[19] = n; + return 0; +} diff --git a/sph_blake.h b/sph_blake.h new file mode 100644 index 0000000..0fc4295 --- /dev/null +++ b/sph_blake.h @@ -0,0 +1,327 @@ +/* $Id: sph_blake.h 252 2011-06-07 17:55:14Z tp $ */ +/** + * BLAKE interface. BLAKE is a family of functions which differ by their + * output size; this implementation defines BLAKE for output sizes 224, + * 256, 384 and 512 bits. This implementation conforms to the "third + * round" specification. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @file sph_blake.h + * @author Thomas Pornin + */ + +#ifndef SPH_BLAKE_H__ +#define SPH_BLAKE_H__ + +#ifdef __cplusplus +extern "C"{ +#endif + +#include +#include "sph_types.h" + +/** + * Output size (in bits) for BLAKE-224. + */ +#define SPH_SIZE_blake224 224 + +/** + * Output size (in bits) for BLAKE-256. + */ +#define SPH_SIZE_blake256 256 + +#if SPH_64 + +/** + * Output size (in bits) for BLAKE-384. + */ +#define SPH_SIZE_blake384 384 + +/** + * Output size (in bits) for BLAKE-512. + */ +#define SPH_SIZE_blake512 512 + +#endif + +/** + * This structure is a context for BLAKE-224 and BLAKE-256 computations: + * it contains the intermediate values and some data from the last + * entered block. Once a BLAKE computation has been performed, the + * context can be reused for another computation. + * + * The contents of this structure are private. A running BLAKE + * computation can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[64]; /* first field, for alignment */ + size_t ptr; + sph_u32 H[8]; + sph_u32 S[4]; + sph_u32 T0, T1; +#endif +} sph_blake_small_context; + +/** + * This structure is a context for BLAKE-224 computations. It is + * identical to the common sph_blake_small_context. + */ +typedef sph_blake_small_context sph_blake224_context; + +/** + * This structure is a context for BLAKE-256 computations. It is + * identical to the common sph_blake_small_context. 
+ */ +typedef sph_blake_small_context sph_blake256_context; + +#if SPH_64 + +/** + * This structure is a context for BLAKE-384 and BLAKE-512 computations: + * it contains the intermediate values and some data from the last + * entered block. Once a BLAKE computation has been performed, the + * context can be reused for another computation. + * + * The contents of this structure are private. A running BLAKE + * computation can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[128]; /* first field, for alignment */ + size_t ptr; + sph_u64 H[8]; + sph_u64 S[4]; + sph_u64 T0, T1; +#endif +} sph_blake_big_context; + +/** + * This structure is a context for BLAKE-384 computations. It is + * identical to the common sph_blake_small_context. + */ +typedef sph_blake_big_context sph_blake384_context; + +/** + * This structure is a context for BLAKE-512 computations. It is + * identical to the common sph_blake_small_context. + */ +typedef sph_blake_big_context sph_blake512_context; + +#endif + +/** + * Initialize a BLAKE-224 context. This process performs no memory allocation. + * + * @param cc the BLAKE-224 context (pointer to a + * sph_blake224_context) + */ +void sph_blake224_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the BLAKE-224 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_blake224(void *cc, const void *data, size_t len); + +/** + * Terminate the current BLAKE-224 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (28 bytes). The context is automatically + * reinitialized. + * + * @param cc the BLAKE-224 context + * @param dst the destination buffer + */ +void sph_blake224_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (28 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the BLAKE-224 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_blake224_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a BLAKE-256 context. This process performs no memory allocation. + * + * @param cc the BLAKE-256 context (pointer to a + * sph_blake256_context) + */ +void sph_blake256_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the BLAKE-256 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_blake256(void *cc, const void *data, size_t len); + +/** + * Terminate the current BLAKE-256 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (32 bytes). The context is automatically + * reinitialized. 
+ * + * @param cc the BLAKE-256 context + * @param dst the destination buffer + */ +void sph_blake256_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (32 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the BLAKE-256 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_blake256_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#if SPH_64 + +/** + * Initialize a BLAKE-384 context. This process performs no memory allocation. + * + * @param cc the BLAKE-384 context (pointer to a + * sph_blake384_context) + */ +void sph_blake384_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the BLAKE-384 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_blake384(void *cc, const void *data, size_t len); + +/** + * Terminate the current BLAKE-384 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (48 bytes). The context is automatically + * reinitialized. + * + * @param cc the BLAKE-384 context + * @param dst the destination buffer + */ +void sph_blake384_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (48 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the BLAKE-384 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_blake384_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a BLAKE-512 context. This process performs no memory allocation. + * + * @param cc the BLAKE-512 context (pointer to a + * sph_blake512_context) + */ +void sph_blake512_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the BLAKE-512 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_blake512(void *cc, const void *data, size_t len); + +/** + * Terminate the current BLAKE-512 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (64 bytes). The context is automatically + * reinitialized. + * + * @param cc the BLAKE-512 context + * @param dst the destination buffer + */ +void sph_blake512_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (64 bytes). 
If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the BLAKE-512 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_blake512_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/sph_fugue.h b/sph_fugue.h new file mode 100644 index 0000000..c8ff395 --- /dev/null +++ b/sph_fugue.h @@ -0,0 +1,81 @@ +#ifndef SPH_FUGUE_H__ +#define SPH_FUGUE_H__ + +#include +#include "sph_types.h" + +#ifdef __cplusplus +extern "C"{ +#endif + +#define SPH_SIZE_fugue224 224 + +#define SPH_SIZE_fugue256 256 + +#define SPH_SIZE_fugue384 384 + +#define SPH_SIZE_fugue512 512 + +typedef struct { +#ifndef DOXYGEN_IGNORE + sph_u32 partial; + unsigned partial_len; + unsigned round_shift; + sph_u32 S[36]; +#if SPH_64 + sph_u64 bit_count; +#else + sph_u32 bit_count_high, bit_count_low; +#endif +#endif +} sph_fugue_context; + +typedef sph_fugue_context sph_fugue224_context; + +typedef sph_fugue_context sph_fugue256_context; + +typedef sph_fugue_context sph_fugue384_context; + +typedef sph_fugue_context sph_fugue512_context; + +void sph_fugue224_init(void *cc); + +void sph_fugue224(void *cc, const void *data, size_t len); + +void sph_fugue224_close(void *cc, void *dst); + +void sph_fugue224_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +void sph_fugue256_init(void *cc); + +void sph_fugue256(void *cc, const void *data, size_t len); + +void sph_fugue256_close(void *cc, void *dst); + +void sph_fugue256_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +void sph_fugue384_init(void *cc); + +void sph_fugue384(void *cc, const void *data, size_t len); + +void sph_fugue384_close(void *cc, void *dst); + +void sph_fugue384_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +void sph_fugue512_init(void *cc); + +void sph_fugue512(void *cc, const void *data, size_t len); + +void sph_fugue512_close(void *cc, void *dst); + +void sph_fugue512_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/sph_groestl.h b/sph_groestl.h new file mode 100644 index 0000000..a997431 --- /dev/null +++ b/sph_groestl.h @@ -0,0 +1,329 @@ +/* $Id: sph_groestl.h 216 2010-06-08 09:46:57Z tp $ */ +/** + * Groestl interface. This code implements Groestl with the recommended + * parameters for SHA-3, with outputs of 224, 256, 384 and 512 bits. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @file sph_groestl.h + * @author Thomas Pornin + */ + +#ifndef SPH_GROESTL_H__ +#define SPH_GROESTL_H__ + +#ifdef __cplusplus +extern "C"{ +#endif + +#include +#include "sph_types.h" + +/** + * Output size (in bits) for Groestl-224. + */ +#define SPH_SIZE_groestl224 224 + +/** + * Output size (in bits) for Groestl-256. + */ +#define SPH_SIZE_groestl256 256 + +/** + * Output size (in bits) for Groestl-384. + */ +#define SPH_SIZE_groestl384 384 + +/** + * Output size (in bits) for Groestl-512. + */ +#define SPH_SIZE_groestl512 512 + +/** + * This structure is a context for Groestl-224 and Groestl-256 computations: + * it contains the intermediate values and some data from the last + * entered block. Once a Groestl computation has been performed, the + * context can be reused for another computation. + * + * The contents of this structure are private. A running Groestl + * computation can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[64]; /* first field, for alignment */ + size_t ptr; + union { +#if SPH_64 + sph_u64 wide[8]; +#endif + sph_u32 narrow[16]; + } state; +#if SPH_64 + sph_u64 count; +#else + sph_u32 count_high, count_low; +#endif +#endif +} sph_groestl_small_context; + +/** + * This structure is a context for Groestl-224 computations. It is + * identical to the common sph_groestl_small_context. + */ +typedef sph_groestl_small_context sph_groestl224_context; + +/** + * This structure is a context for Groestl-256 computations. It is + * identical to the common sph_groestl_small_context. + */ +typedef sph_groestl_small_context sph_groestl256_context; + +/** + * This structure is a context for Groestl-384 and Groestl-512 computations: + * it contains the intermediate values and some data from the last + * entered block. Once a Groestl computation has been performed, the + * context can be reused for another computation. + * + * The contents of this structure are private. A running Groestl + * computation can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[128]; /* first field, for alignment */ + size_t ptr; + union { +#if SPH_64 + sph_u64 wide[16]; +#endif + sph_u32 narrow[32]; + } state; +#if SPH_64 + sph_u64 count; +#else + sph_u32 count_high, count_low; +#endif +#endif +} sph_groestl_big_context; + +/** + * This structure is a context for Groestl-384 computations. It is + * identical to the common sph_groestl_small_context. + */ +typedef sph_groestl_big_context sph_groestl384_context; + +/** + * This structure is a context for Groestl-512 computations. It is + * identical to the common sph_groestl_small_context. + */ +typedef sph_groestl_big_context sph_groestl512_context; + +/** + * Initialize a Groestl-224 context. This process performs no memory allocation. 
+ * + * @param cc the Groestl-224 context (pointer to a + * sph_groestl224_context) + */ +void sph_groestl224_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Groestl-224 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_groestl224(void *cc, const void *data, size_t len); + +/** + * Terminate the current Groestl-224 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (28 bytes). The context is automatically + * reinitialized. + * + * @param cc the Groestl-224 context + * @param dst the destination buffer + */ +void sph_groestl224_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (28 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Groestl-224 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_groestl224_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Groestl-256 context. This process performs no memory allocation. + * + * @param cc the Groestl-256 context (pointer to a + * sph_groestl256_context) + */ +void sph_groestl256_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Groestl-256 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_groestl256(void *cc, const void *data, size_t len); + +/** + * Terminate the current Groestl-256 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (32 bytes). The context is automatically + * reinitialized. + * + * @param cc the Groestl-256 context + * @param dst the destination buffer + */ +void sph_groestl256_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (32 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Groestl-256 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_groestl256_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Groestl-384 context. This process performs no memory allocation. + * + * @param cc the Groestl-384 context (pointer to a + * sph_groestl384_context) + */ +void sph_groestl384_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). 
+ * + * @param cc the Groestl-384 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_groestl384(void *cc, const void *data, size_t len); + +/** + * Terminate the current Groestl-384 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (48 bytes). The context is automatically + * reinitialized. + * + * @param cc the Groestl-384 context + * @param dst the destination buffer + */ +void sph_groestl384_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (48 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Groestl-384 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_groestl384_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Groestl-512 context. This process performs no memory allocation. + * + * @param cc the Groestl-512 context (pointer to a + * sph_groestl512_context) + */ +void sph_groestl512_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Groestl-512 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_groestl512(void *cc, const void *data, size_t len); + +/** + * Terminate the current Groestl-512 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (64 bytes). The context is automatically + * reinitialized. + * + * @param cc the Groestl-512 context + * @param dst the destination buffer + */ +void sph_groestl512_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (64 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Groestl-512 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_groestl512_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/sph_keccak.h b/sph_keccak.h new file mode 100644 index 0000000..8760598 --- /dev/null +++ b/sph_keccak.h @@ -0,0 +1,293 @@ +/* $Id: sph_keccak.h 216 2010-06-08 09:46:57Z tp $ */ +/** + * Keccak interface. This is the interface for Keccak with the + * recommended parameters for SHA-3, with output lengths 224, 256, + * 384 and 512 bits. 
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010 Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file sph_keccak.h
+ * @author Thomas Pornin
+ */
+
+#ifndef SPH_KECCAK_H__
+#define SPH_KECCAK_H__
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include <stddef.h>
+#include "sph_types.h"
+
+/**
+ * Output size (in bits) for Keccak-224.
+ */
+#define SPH_SIZE_keccak224 224
+
+/**
+ * Output size (in bits) for Keccak-256.
+ */
+#define SPH_SIZE_keccak256 256
+
+/**
+ * Output size (in bits) for Keccak-384.
+ */
+#define SPH_SIZE_keccak384 384
+
+/**
+ * Output size (in bits) for Keccak-512.
+ */
+#define SPH_SIZE_keccak512 512
+
+/**
+ * This structure is a context for Keccak computations: it contains the
+ * intermediate values and some data from the last entered block. Once a
+ * Keccak computation has been performed, the context can be reused for
+ * another computation.
+ *
+ * The contents of this structure are private. A running Keccak computation
+ * can be cloned by copying the context (e.g. with a simple
+ * memcpy()).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[144];   /* first field, for alignment */
+	size_t ptr, lim;
+	union {
+#if SPH_64
+		sph_u64 wide[25];
+#endif
+		sph_u32 narrow[50];
+	} u;
+#endif
+} sph_keccak_context;
+
+/**
+ * Type for a Keccak-224 context (identical to the common context).
+ */
+typedef sph_keccak_context sph_keccak224_context;
+
+/**
+ * Type for a Keccak-256 context (identical to the common context).
+ */
+typedef sph_keccak_context sph_keccak256_context;
+
+/**
+ * Type for a Keccak-384 context (identical to the common context).
+ */
+typedef sph_keccak_context sph_keccak384_context;
+
+/**
+ * Type for a Keccak-512 context (identical to the common context).
+ */
+typedef sph_keccak_context sph_keccak512_context;
+
+/**
+ * Initialize a Keccak-224 context. This process performs no memory allocation.
+ *
+ * @param cc the Keccak-224 context (pointer to a
+ * sph_keccak224_context)
+ */
+void sph_keccak224_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that len is zero
+ * (in which case this function does nothing).
+ * + * @param cc the Keccak-224 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_keccak224(void *cc, const void *data, size_t len); + +/** + * Terminate the current Keccak-224 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (28 bytes). The context is automatically + * reinitialized. + * + * @param cc the Keccak-224 context + * @param dst the destination buffer + */ +void sph_keccak224_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (28 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Keccak-224 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_keccak224_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Keccak-256 context. This process performs no memory allocation. + * + * @param cc the Keccak-256 context (pointer to a + * sph_keccak256_context) + */ +void sph_keccak256_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Keccak-256 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_keccak256(void *cc, const void *data, size_t len); + +/** + * Terminate the current Keccak-256 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (32 bytes). The context is automatically + * reinitialized. + * + * @param cc the Keccak-256 context + * @param dst the destination buffer + */ +void sph_keccak256_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (32 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Keccak-256 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_keccak256_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Keccak-384 context. This process performs no memory allocation. + * + * @param cc the Keccak-384 context (pointer to a + * sph_keccak384_context) + */ +void sph_keccak384_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Keccak-384 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_keccak384(void *cc, const void *data, size_t len); + +/** + * Terminate the current Keccak-384 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (48 bytes). The context is automatically + * reinitialized. 
+ * + * @param cc the Keccak-384 context + * @param dst the destination buffer + */ +void sph_keccak384_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (48 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Keccak-384 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_keccak384_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Keccak-512 context. This process performs no memory allocation. + * + * @param cc the Keccak-512 context (pointer to a + * sph_keccak512_context) + */ +void sph_keccak512_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Keccak-512 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_keccak512(void *cc, const void *data, size_t len); + +/** + * Terminate the current Keccak-512 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (64 bytes). The context is automatically + * reinitialized. + * + * @param cc the Keccak-512 context + * @param dst the destination buffer + */ +void sph_keccak512_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (64 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Keccak-512 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_keccak512_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/sph_types.h b/sph_types.h new file mode 100644 index 0000000..054c96f --- /dev/null +++ b/sph_types.h @@ -0,0 +1,1976 @@ +/* $Id: sph_types.h 260 2011-07-21 01:02:38Z tp $ */ +/** + * Basic type definitions. + * + * This header file defines the generic integer types that will be used + * for the implementation of hash functions; it also contains helper + * functions which encode and decode multi-byte integer values, using + * either little-endian or big-endian conventions. + * + * This file contains a compile-time test on the size of a byte + * (the unsigned char C type). If bytes are not octets, + * i.e. if they do not have a size of exactly 8 bits, then compilation + * is aborted. Architectures where bytes are not octets are relatively + * rare, even in the embedded devices market. We forbid non-octet bytes + * because there is no clear convention on how octet streams are encoded + * on such systems. 
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010 Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file sph_types.h
+ * @author Thomas Pornin
+ */
+
+#ifndef SPH_TYPES_H__
+#define SPH_TYPES_H__
+
+#include <limits.h>
+
+/*
+ * All our I/O functions are defined over octet streams. We do not know
+ * how to handle input data if bytes are not octets.
+ */
+#if CHAR_BIT != 8
+#error This code requires 8-bit bytes
+#endif
+
+/* ============= BEGIN documentation block for Doxygen ============ */
+
+#ifdef DOXYGEN_IGNORE
+
+/** @mainpage sphlib C code documentation
+ *
+ * @section overview Overview
+ *
+ * sphlib is a library which contains implementations of
+ * various cryptographic hash functions. These pages have been generated
+ * with doxygen and
+ * document the API for the C implementations.
+ *
+ * The API is described in appropriate header files, which are available
+ * in the "Files" section. Each hash function family has its own header,
+ * whose name begins with "sph_" and contains the family
+ * name. For instance, the API for the RIPEMD hash functions is available
+ * in the header file sph_ripemd.h.
+ *
+ * @section principles API structure and conventions
+ *
+ * @subsection io Input/output conventions
+ *
+ * In all generality, hash functions operate over strings of bits.
+ * Individual bits are rarely encountered in C programming or actual
+ * communication protocols; most protocols converge on the ubiquitous
+ * "octet" which is a group of eight bits. Data is thus expressed as a
+ * stream of octets. The C programming language contains the notion of a
+ * "byte", which is a data unit managed under the type "unsigned
+ * char". The C standard prescribes that a byte should hold at
+ * least eight bits, but possibly more. Most modern architectures, even
+ * in the embedded world, feature eight-bit bytes, i.e. map bytes to
+ * octets.
+ *
+ * Nevertheless, for some of the implemented hash functions, an extra
+ * API has been added, which allows the input of arbitrary sequences of
+ * bits: when the computation is about to be closed, 1 to 7 extra bits
+ * can be added. The functions for which this API is implemented include
+ * the SHA-2 functions and all SHA-3 candidates.
+ *
+ * sphlib defines hash functions which may hash octet streams, i.e.
streams of bits where the number of bits is a multiple of eight. + * The data input functions in the sphlib API expect data + * as anonymous pointers ("const void *") with a length + * (of type "size_t") which gives the input data chunk length + * in bytes. A byte is assumed to be an octet; the sph_types.h + * header contains a compile-time test which prevents compilation on + * architectures where this property is not met. + * + * The hash function output is also converted into bytes. All currently + * implemented hash functions have an output width which is a multiple of + * eight, and this is likely to remain true for new designs. + * + * Most hash functions internally convert input data into 32-bit of 64-bit + * words, using either little-endian or big-endian conversion. The hash + * output also often consists of such words, which are encoded into output + * bytes with a similar endianness convention. Some hash functions have + * been only loosely specified on that subject; when necessary, + * sphlib has been tested against published "reference" + * implementations in order to use the same conventions. + * + * @subsection shortname Function short name + * + * Each implemented hash function has a "short name" which is used + * internally to derive the identifiers for the functions and context + * structures which the function uses. For instance, MD5 has the short + * name "md5". Short names are listed in the next section, + * for the implemented hash functions. In subsequent sections, the + * short name will be assumed to be "XXX": replace with the + * actual hash function name to get the C identifier. + * + * Note: some functions within the same family share the same core + * elements, such as update function or context structure. Correspondingly, + * some of the defined types or functions may actually be macros which + * transparently evaluate to another type or function name. + * + * @subsection context Context structure + * + * Each implemented hash fonction has its own context structure, available + * under the type name "sph_XXX_context" for the hash function + * with short name "XXX". This structure holds all needed + * state for a running hash computation. + * + * The contents of these structures are meant to be opaque, and private + * to the implementation. However, these contents are specified in the + * header files so that application code which uses sphlib + * may access the size of those structures. + * + * The caller is responsible for allocating the context structure, + * whether by dynamic allocation (malloc() or equivalent), + * static allocation (a global permanent variable), as an automatic + * variable ("on the stack"), or by any other mean which ensures proper + * structure alignment. sphlib code performs no dynamic + * allocation by itself. + * + * The context must be initialized before use, using the + * sph_XXX_init() function. This function sets the context + * state to proper initial values for hashing. + * + * Since all state data is contained within the context structure, + * sphlib is thread-safe and reentrant: several hash + * computations may be performed in parallel, provided that they do not + * operate on the same context. Moreover, a running computation can be + * cloned by copying the context (with a simple memcpy()): + * the context and its clone are then independant and may be updated + * with new data and/or closed without interfering with each other. 
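+ *
+ * As an illustration (the short name "xxx" below stands for any
+ * implemented function and is not itself part of the API), a running
+ * computation can be forked in this way:
+ *
+ *    sph_xxx_context cc, cc2;
+ *
+ *    sph_xxx_init(&cc);
+ *    sph_xxx(&cc, prefix, prefix_len);
+ *    memcpy(&cc2, &cc, sizeof cc);
+ *    sph_xxx(&cc, data1, len1);
+ *    sph_xxx(&cc2, data2, len2);
+ *
+ * After the memcpy() call, both contexts have absorbed the common prefix
+ * and then proceed independently on their own data.
+ *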
+ * Similarly, a context structure can be moved in memory at will: + * context structures contain no pointer, in particular no pointer to + * themselves. + * + * @subsection dataio Data input + * + * Hashed data is input with the sph_XXX() fonction, which + * takes as parameters a pointer to the context, a pointer to the data + * to hash, and the number of data bytes to hash. The context is updated + * with the new data. + * + * Data can be input in one or several calls, with arbitrary input lengths. + * However, it is best, performance wise, to input data by relatively big + * chunks (say a few kilobytes), because this allows sphlib to + * optimize things and avoid internal copying. + * + * When all data has been input, the context can be closed with + * sph_XXX_close(). The hash output is computed and written + * into the provided buffer. The caller must take care to provide a + * buffer of appropriate length; e.g., when using SHA-1, the output is + * a 20-byte word, therefore the output buffer must be at least 20-byte + * long. + * + * For some hash functions, the sph_XXX_addbits_and_close() + * function can be used instead of sph_XXX_close(). This + * function can take a few extra bits to be added at + * the end of the input message. This allows hashing messages with a + * bit length which is not a multiple of 8. The extra bits are provided + * as an unsigned integer value, and a bit count. The bit count must be + * between 0 and 7, inclusive. The extra bits are provided as bits 7 to + * 0 (bits of numerical value 128, 64, 32... downto 0), in that order. + * For instance, to add three bits of value 1, 1 and 0, the unsigned + * integer will have value 192 (1*128 + 1*64 + 0*32) and the bit count + * will be 3. + * + * The SPH_SIZE_XXX macro is defined for each hash function; + * it evaluates to the function output size, expressed in bits. For instance, + * SPH_SIZE_sha1 evaluates to 160. + * + * When closed, the context is automatically reinitialized and can be + * immediately used for another computation. It is not necessary to call + * sph_XXX_init() after a close. Note that + * sph_XXX_init() can still be called to "reset" a context, + * i.e. forget previously input data, and get back to the initial state. + * + * @subsection alignment Data alignment + * + * "Alignment" is a property of data, which is said to be "properly + * aligned" when its emplacement in memory is such that the data can + * be optimally read by full words. This depends on the type of access; + * basically, some hash functions will read data by 32-bit or 64-bit + * words. sphlib does not mandate such alignment for input + * data, but using aligned data can substantially improve performance. + * + * As a rule, it is best to input data by chunks whose length (in bytes) + * is a multiple of eight, and which begins at "generally aligned" + * addresses, such as the base address returned by a call to + * malloc(). + * + * @section functions Implemented functions + * + * We give here the list of implemented functions. They are grouped by + * family; to each family corresponds a specific header file. Each + * individual function has its associated "short name". Please refer to + * the documentation for that header file to get details on the hash + * function denomination and provenance. + * + * Note: the functions marked with a '(64)' in the list below are + * available only if the C compiler provides an integer type of length + * 64 bits or more. 
Such a type is mandatory in the latest C standard + * (ISO 9899:1999, aka "C99") and is present in several older compilers + * as well, so chances are that such a type is available. + * + * - HAVAL family: file sph_haval.h + * - HAVAL-128/3 (128-bit, 3 passes): short name: haval128_3 + * - HAVAL-128/4 (128-bit, 4 passes): short name: haval128_4 + * - HAVAL-128/5 (128-bit, 5 passes): short name: haval128_5 + * - HAVAL-160/3 (160-bit, 3 passes): short name: haval160_3 + * - HAVAL-160/4 (160-bit, 4 passes): short name: haval160_4 + * - HAVAL-160/5 (160-bit, 5 passes): short name: haval160_5 + * - HAVAL-192/3 (192-bit, 3 passes): short name: haval192_3 + * - HAVAL-192/4 (192-bit, 4 passes): short name: haval192_4 + * - HAVAL-192/5 (192-bit, 5 passes): short name: haval192_5 + * - HAVAL-224/3 (224-bit, 3 passes): short name: haval224_3 + * - HAVAL-224/4 (224-bit, 4 passes): short name: haval224_4 + * - HAVAL-224/5 (224-bit, 5 passes): short name: haval224_5 + * - HAVAL-256/3 (256-bit, 3 passes): short name: haval256_3 + * - HAVAL-256/4 (256-bit, 4 passes): short name: haval256_4 + * - HAVAL-256/5 (256-bit, 5 passes): short name: haval256_5 + * - MD2: file sph_md2.h, short name: md2 + * - MD4: file sph_md4.h, short name: md4 + * - MD5: file sph_md5.h, short name: md5 + * - PANAMA: file sph_panama.h, short name: panama + * - RadioGatun family: file sph_radiogatun.h + * - RadioGatun[32]: short name: radiogatun32 + * - RadioGatun[64]: short name: radiogatun64 (64) + * - RIPEMD family: file sph_ripemd.h + * - RIPEMD: short name: ripemd + * - RIPEMD-128: short name: ripemd128 + * - RIPEMD-160: short name: ripemd160 + * - SHA-0: file sph_sha0.h, short name: sha0 + * - SHA-1: file sph_sha1.h, short name: sha1 + * - SHA-2 family, 32-bit hashes: file sph_sha2.h + * - SHA-224: short name: sha224 + * - SHA-256: short name: sha256 + * - SHA-384: short name: sha384 (64) + * - SHA-512: short name: sha512 (64) + * - Tiger family: file sph_tiger.h + * - Tiger: short name: tiger (64) + * - Tiger2: short name: tiger2 (64) + * - WHIRLPOOL family: file sph_whirlpool.h + * - WHIRLPOOL-0: short name: whirlpool0 (64) + * - WHIRLPOOL-1: short name: whirlpool1 (64) + * - WHIRLPOOL: short name: whirlpool (64) + * + * The fourteen second-round SHA-3 candidates are also implemented; + * when applicable, the implementations follow the "final" specifications + * as published for the third round of the SHA-3 competition (BLAKE, + * Groestl, JH, Keccak and Skein have been tweaked for third round). 
+ *
+ * - BLAKE family: file sph_blake.h
+ *    - BLAKE-224: short name: blake224
+ *    - BLAKE-256: short name: blake256
+ *    - BLAKE-384: short name: blake384
+ *    - BLAKE-512: short name: blake512
+ * - BMW (Blue Midnight Wish) family: file sph_bmw.h
+ *    - BMW-224: short name: bmw224
+ *    - BMW-256: short name: bmw256
+ *    - BMW-384: short name: bmw384 (64)
+ *    - BMW-512: short name: bmw512 (64)
+ * - CubeHash family: file sph_cubehash.h (specified as
+ *   CubeHash16/32 in the CubeHash specification)
+ *    - CubeHash-224: short name: cubehash224
+ *    - CubeHash-256: short name: cubehash256
+ *    - CubeHash-384: short name: cubehash384
+ *    - CubeHash-512: short name: cubehash512
+ * - ECHO family: file sph_echo.h
+ *    - ECHO-224: short name: echo224
+ *    - ECHO-256: short name: echo256
+ *    - ECHO-384: short name: echo384
+ *    - ECHO-512: short name: echo512
+ * - Fugue family: file sph_fugue.h
+ *    - Fugue-224: short name: fugue224
+ *    - Fugue-256: short name: fugue256
+ *    - Fugue-384: short name: fugue384
+ *    - Fugue-512: short name: fugue512
+ * - Groestl family: file sph_groestl.h
+ *    - Groestl-224: short name: groestl224
+ *    - Groestl-256: short name: groestl256
+ *    - Groestl-384: short name: groestl384
+ *    - Groestl-512: short name: groestl512
+ * - Hamsi family: file sph_hamsi.h
+ *    - Hamsi-224: short name: hamsi224
+ *    - Hamsi-256: short name: hamsi256
+ *    - Hamsi-384: short name: hamsi384
+ *    - Hamsi-512: short name: hamsi512
+ * - JH family: file sph_jh.h
+ *    - JH-224: short name: jh224
+ *    - JH-256: short name: jh256
+ *    - JH-384: short name: jh384
+ *    - JH-512: short name: jh512
+ * - Keccak family: file sph_keccak.h
+ *    - Keccak-224: short name: keccak224
+ *    - Keccak-256: short name: keccak256
+ *    - Keccak-384: short name: keccak384
+ *    - Keccak-512: short name: keccak512
+ * - Luffa family: file sph_luffa.h
+ *    - Luffa-224: short name: luffa224
+ *    - Luffa-256: short name: luffa256
+ *    - Luffa-384: short name: luffa384
+ *    - Luffa-512: short name: luffa512
+ * - Shabal family: file sph_shabal.h
+ *    - Shabal-192: short name: shabal192
+ *    - Shabal-224: short name: shabal224
+ *    - Shabal-256: short name: shabal256
+ *    - Shabal-384: short name: shabal384
+ *    - Shabal-512: short name: shabal512
+ * - SHAvite-3 family: file sph_shavite.h
+ *    - SHAvite-224 (nominally "SHAvite-3 with 224-bit output"):
+ *      short name: shavite224
+ *    - SHAvite-256 (nominally "SHAvite-3 with 256-bit output"):
+ *      short name: shavite256
+ *    - SHAvite-384 (nominally "SHAvite-3 with 384-bit output"):
+ *      short name: shavite384
+ *    - SHAvite-512 (nominally "SHAvite-3 with 512-bit output"):
+ *      short name: shavite512
+ * - SIMD family: file sph_simd.h
+ *    - SIMD-224: short name: simd224
+ *    - SIMD-256: short name: simd256
+ *    - SIMD-384: short name: simd384
+ *    - SIMD-512: short name: simd512
+ * - Skein family: file sph_skein.h
+ *    - Skein-224 (nominally specified as Skein-512-224): short name:
+ *      skein224 (64)
+ *    - Skein-256 (nominally specified as Skein-512-256): short name:
+ *      skein256 (64)
+ *    - Skein-384 (nominally specified as Skein-512-384): short name:
+ *      skein384 (64)
+ *    - Skein-512 (nominally specified as Skein-512-512): short name:
+ *      skein512 (64)
+ *
+ * For the second-round SHA-3 candidates, the functions are as specified
+ * for round 2, i.e. with the "tweaks" that some candidates added
+ * between round 1 and round 2. Also, some of the submitted packages for
+ * round 2 contained errors, in the specification, reference code, or
+ * both. sphlib implements the corrected versions.
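+ *
+ * As a concrete illustration (using Keccak-256 from sph_keccak.h; any
+ * other implemented function follows the same pattern, with "keccak256"
+ * replaced by the relevant short name), hashing a byte message and then
+ * appending the three extra bits 1, 1 and 0 before closing would read:
+ *
+ *    unsigned char out[32];
+ *    sph_keccak256_context cc;
+ *
+ *    sph_keccak256_init(&cc);
+ *    sph_keccak256(&cc, message, message_len);
+ *    sph_keccak256_addbits_and_close(&cc, 192, 3, out);
+ *
+ * where message and message_len designate the caller's data, the output
+ * buffer holds SPH_SIZE_keccak256 / 8 = 32 bytes, and the value 192
+ * encodes the extra bits as explained above (1*128 + 1*64 + 0*32). The
+ * context is immediately ready for a new computation afterwards.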
+ */ + +/** @hideinitializer + * Unsigned integer type whose length is at least 32 bits; on most + * architectures, it will have a width of exactly 32 bits. Unsigned C + * types implement arithmetics modulo a power of 2; use the + * SPH_T32() macro to ensure that the value is truncated + * to exactly 32 bits. Unless otherwise specified, all macros and + * functions which accept sph_u32 values assume that these + * values fit on 32 bits, i.e. do not exceed 2^32-1, even on architectures + * where sph_u32 is larger than that. + */ +typedef __arch_dependant__ sph_u32; + +/** @hideinitializer + * Signed integer type corresponding to sph_u32; it has + * width 32 bits or more. + */ +typedef __arch_dependant__ sph_s32; + +/** @hideinitializer + * Unsigned integer type whose length is at least 64 bits; on most + * architectures which feature such a type, it will have a width of + * exactly 64 bits. C99-compliant platform will have this type; it + * is also defined when the GNU compiler (gcc) is used, and on + * platforms where unsigned long is large enough. If this + * type is not available, then some hash functions which depends on + * a 64-bit type will not be available (most notably SHA-384, SHA-512, + * Tiger and WHIRLPOOL). + */ +typedef __arch_dependant__ sph_u64; + +/** @hideinitializer + * Signed integer type corresponding to sph_u64; it has + * width 64 bits or more. + */ +typedef __arch_dependant__ sph_s64; + +/** + * This macro expands the token x into a suitable + * constant expression of type sph_u32. Depending on + * how this type is defined, a suffix such as UL may + * be appended to the argument. + * + * @param x the token to expand into a suitable constant expression + */ +#define SPH_C32(x) + +/** + * Truncate a 32-bit value to exactly 32 bits. On most systems, this is + * a no-op, recognized as such by the compiler. + * + * @param x the value to truncate (of type sph_u32) + */ +#define SPH_T32(x) + +/** + * Rotate a 32-bit value by a number of bits to the left. The rotate + * count must reside between 1 and 31. This macro assumes that its + * first argument fits in 32 bits (no extra bit allowed on machines where + * sph_u32 is wider); both arguments may be evaluated + * several times. + * + * @param x the value to rotate (of type sph_u32) + * @param n the rotation count (between 1 and 31, inclusive) + */ +#define SPH_ROTL32(x, n) + +/** + * Rotate a 32-bit value by a number of bits to the left. The rotate + * count must reside between 1 and 31. This macro assumes that its + * first argument fits in 32 bits (no extra bit allowed on machines where + * sph_u32 is wider); both arguments may be evaluated + * several times. + * + * @param x the value to rotate (of type sph_u32) + * @param n the rotation count (between 1 and 31, inclusive) + */ +#define SPH_ROTR32(x, n) + +/** + * This macro is defined on systems for which a 64-bit type has been + * detected, and is used for sph_u64. + */ +#define SPH_64 + +/** + * This macro is defined on systems for the "native" integer size is + * 64 bits (64-bit values fit in one register). + */ +#define SPH_64_TRUE + +/** + * This macro expands the token x into a suitable + * constant expression of type sph_u64. Depending on + * how this type is defined, a suffix such as ULL may + * be appended to the argument. This macro is defined only if a + * 64-bit type was detected and used for sph_u64. + * + * @param x the token to expand into a suitable constant expression + */ +#define SPH_C64(x) + +/** + * Truncate a 64-bit value to exactly 64 bits. 
On most systems, this is + * a no-op, recognized as such by the compiler. This macro is defined only + * if a 64-bit type was detected and used for sph_u64. + * + * @param x the value to truncate (of type sph_u64) + */ +#define SPH_T64(x) + +/** + * Rotate a 64-bit value by a number of bits to the left. The rotate + * count must reside between 1 and 63. This macro assumes that its + * first argument fits in 64 bits (no extra bit allowed on machines where + * sph_u64 is wider); both arguments may be evaluated + * several times. This macro is defined only if a 64-bit type was detected + * and used for sph_u64. + * + * @param x the value to rotate (of type sph_u64) + * @param n the rotation count (between 1 and 63, inclusive) + */ +#define SPH_ROTL64(x, n) + +/** + * Rotate a 64-bit value by a number of bits to the left. The rotate + * count must reside between 1 and 63. This macro assumes that its + * first argument fits in 64 bits (no extra bit allowed on machines where + * sph_u64 is wider); both arguments may be evaluated + * several times. This macro is defined only if a 64-bit type was detected + * and used for sph_u64. + * + * @param x the value to rotate (of type sph_u64) + * @param n the rotation count (between 1 and 63, inclusive) + */ +#define SPH_ROTR64(x, n) + +/** + * This macro evaluates to inline or an equivalent construction, + * if available on the compilation platform, or to nothing otherwise. This + * is used to declare inline functions, for which the compiler should + * endeavour to include the code directly in the caller. Inline functions + * are typically defined in header files as replacement for macros. + */ +#define SPH_INLINE + +/** + * This macro is defined if the platform has been detected as using + * little-endian convention. This implies that the sph_u32 + * type (and the sph_u64 type also, if it is defined) has + * an exact width (i.e. exactly 32-bit, respectively 64-bit). + */ +#define SPH_LITTLE_ENDIAN + +/** + * This macro is defined if the platform has been detected as using + * big-endian convention. This implies that the sph_u32 + * type (and the sph_u64 type also, if it is defined) has + * an exact width (i.e. exactly 32-bit, respectively 64-bit). + */ +#define SPH_BIG_ENDIAN + +/** + * This macro is defined if 32-bit words (and 64-bit words, if defined) + * can be read from and written to memory efficiently in little-endian + * convention. This is the case for little-endian platforms, and also + * for the big-endian platforms which have special little-endian access + * opcodes (e.g. Ultrasparc). + */ +#define SPH_LITTLE_FAST + +/** + * This macro is defined if 32-bit words (and 64-bit words, if defined) + * can be read from and written to memory efficiently in big-endian + * convention. This is the case for little-endian platforms, and also + * for the little-endian platforms which have special big-endian access + * opcodes. + */ +#define SPH_BIG_FAST + +/** + * On some platforms, this macro is defined to an unsigned integer type + * into which pointer values may be cast. The resulting value can then + * be tested for being a multiple of 2, 4 or 8, indicating an aligned + * pointer for, respectively, 16-bit, 32-bit or 64-bit memory accesses. + */ +#define SPH_UPTR + +/** + * When defined, this macro indicates that unaligned memory accesses + * are possible with only a minor penalty, and thus should be prefered + * over strategies which first copy data to an aligned buffer. + */ +#define SPH_UNALIGNED + +/** + * Byte-swap a 32-bit word (i.e. 
0x12345678 becomes + * 0x78563412). This is an inline function which resorts + * to inline assembly on some platforms, for better performance. + * + * @param x the 32-bit value to byte-swap + * @return the byte-swapped value + */ +static inline sph_u32 sph_bswap32(sph_u32 x); + +/** + * Byte-swap a 64-bit word. This is an inline function which resorts + * to inline assembly on some platforms, for better performance. This + * function is defined only if a suitable 64-bit type was found for + * sph_u64 + * + * @param x the 64-bit value to byte-swap + * @return the byte-swapped value + */ +static inline sph_u64 sph_bswap64(sph_u64 x); + +/** + * Decode a 16-bit unsigned value from memory, in little-endian convention + * (least significant byte comes first). + * + * @param src the source address + * @return the decoded value + */ +static inline unsigned sph_dec16le(const void *src); + +/** + * Encode a 16-bit unsigned value into memory, in little-endian convention + * (least significant byte comes first). + * + * @param dst the destination buffer + * @param val the value to encode + */ +static inline void sph_enc16le(void *dst, unsigned val); + +/** + * Decode a 16-bit unsigned value from memory, in big-endian convention + * (most significant byte comes first). + * + * @param src the source address + * @return the decoded value + */ +static inline unsigned sph_dec16be(const void *src); + +/** + * Encode a 16-bit unsigned value into memory, in big-endian convention + * (most significant byte comes first). + * + * @param dst the destination buffer + * @param val the value to encode + */ +static inline void sph_enc16be(void *dst, unsigned val); + +/** + * Decode a 32-bit unsigned value from memory, in little-endian convention + * (least significant byte comes first). + * + * @param src the source address + * @return the decoded value + */ +static inline sph_u32 sph_dec32le(const void *src); + +/** + * Decode a 32-bit unsigned value from memory, in little-endian convention + * (least significant byte comes first). This function assumes that the + * source address is suitably aligned for a direct access, if the platform + * supports such things; it can thus be marginally faster than the generic + * sph_dec32le() function. + * + * @param src the source address + * @return the decoded value + */ +static inline sph_u32 sph_dec32le_aligned(const void *src); + +/** + * Encode a 32-bit unsigned value into memory, in little-endian convention + * (least significant byte comes first). + * + * @param dst the destination buffer + * @param val the value to encode + */ +static inline void sph_enc32le(void *dst, sph_u32 val); + +/** + * Encode a 32-bit unsigned value into memory, in little-endian convention + * (least significant byte comes first). This function assumes that the + * destination address is suitably aligned for a direct access, if the + * platform supports such things; it can thus be marginally faster than + * the generic sph_enc32le() function. + * + * @param dst the destination buffer + * @param val the value to encode + */ +static inline void sph_enc32le_aligned(void *dst, sph_u32 val); + +/** + * Decode a 32-bit unsigned value from memory, in big-endian convention + * (most significant byte comes first). + * + * @param src the source address + * @return the decoded value + */ +static inline sph_u32 sph_dec32be(const void *src); + +/** + * Decode a 32-bit unsigned value from memory, in big-endian convention + * (most significant byte comes first). 
This function assumes that the + * source address is suitably aligned for a direct access, if the platform + * supports such things; it can thus be marginally faster than the generic + * sph_dec32be() function. + * + * @param src the source address + * @return the decoded value + */ +static inline sph_u32 sph_dec32be_aligned(const void *src); + +/** + * Encode a 32-bit unsigned value into memory, in big-endian convention + * (most significant byte comes first). + * + * @param dst the destination buffer + * @param val the value to encode + */ +static inline void sph_enc32be(void *dst, sph_u32 val); + +/** + * Encode a 32-bit unsigned value into memory, in big-endian convention + * (most significant byte comes first). This function assumes that the + * destination address is suitably aligned for a direct access, if the + * platform supports such things; it can thus be marginally faster than + * the generic sph_enc32be() function. + * + * @param dst the destination buffer + * @param val the value to encode + */ +static inline void sph_enc32be_aligned(void *dst, sph_u32 val); + +/** + * Decode a 64-bit unsigned value from memory, in little-endian convention + * (least significant byte comes first). This function is defined only + * if a suitable 64-bit type was detected and used for sph_u64. + * + * @param src the source address + * @return the decoded value + */ +static inline sph_u64 sph_dec64le(const void *src); + +/** + * Decode a 64-bit unsigned value from memory, in little-endian convention + * (least significant byte comes first). This function assumes that the + * source address is suitably aligned for a direct access, if the platform + * supports such things; it can thus be marginally faster than the generic + * sph_dec64le() function. This function is defined only + * if a suitable 64-bit type was detected and used for sph_u64. + * + * @param src the source address + * @return the decoded value + */ +static inline sph_u64 sph_dec64le_aligned(const void *src); + +/** + * Encode a 64-bit unsigned value into memory, in little-endian convention + * (least significant byte comes first). This function is defined only + * if a suitable 64-bit type was detected and used for sph_u64. + * + * @param dst the destination buffer + * @param val the value to encode + */ +static inline void sph_enc64le(void *dst, sph_u64 val); + +/** + * Encode a 64-bit unsigned value into memory, in little-endian convention + * (least significant byte comes first). This function assumes that the + * destination address is suitably aligned for a direct access, if the + * platform supports such things; it can thus be marginally faster than + * the generic sph_enc64le() function. This function is defined + * only if a suitable 64-bit type was detected and used for + * sph_u64. + * + * @param dst the destination buffer + * @param val the value to encode + */ +static inline void sph_enc64le_aligned(void *dst, sph_u64 val); + +/** + * Decode a 64-bit unsigned value from memory, in big-endian convention + * (most significant byte comes first). This function is defined only + * if a suitable 64-bit type was detected and used for sph_u64. + * + * @param src the source address + * @return the decoded value + */ +static inline sph_u64 sph_dec64be(const void *src); + +/** + * Decode a 64-bit unsigned value from memory, in big-endian convention + * (most significant byte comes first). 
This function assumes that the
+ * source address is suitably aligned for a direct access, if the platform
+ * supports such things; it can thus be marginally faster than the generic
+ * sph_dec64be() function. This function is defined only
+ * if a suitable 64-bit type was detected and used for sph_u64.
+ *
+ * @param src the source address
+ * @return the decoded value
+ */
+static inline sph_u64 sph_dec64be_aligned(const void *src);
+
+/**
+ * Encode a 64-bit unsigned value into memory, in big-endian convention
+ * (most significant byte comes first). This function is defined only
+ * if a suitable 64-bit type was detected and used for sph_u64.
+ *
+ * @param dst the destination buffer
+ * @param val the value to encode
+ */
+static inline void sph_enc64be(void *dst, sph_u64 val);
+
+/**
+ * Encode a 64-bit unsigned value into memory, in big-endian convention
+ * (most significant byte comes first). This function assumes that the
+ * destination address is suitably aligned for a direct access, if the
+ * platform supports such things; it can thus be marginally faster than
+ * the generic sph_enc64be() function. This function is defined
+ * only if a suitable 64-bit type was detected and used for
+ * sph_u64.
+ *
+ * @param dst the destination buffer
+ * @param val the value to encode
+ */
+static inline void sph_enc64be_aligned(void *dst, sph_u64 val);
+
+#endif
+
+/* ============== END documentation block for Doxygen ============= */
+
+#ifndef DOXYGEN_IGNORE
+
+/*
+ * We want to define the types "sph_u32" and "sph_u64" which hold
+ * unsigned values of at least, respectively, 32 and 64 bits. These
+ * tests should select appropriate types for most platforms. The
+ * macro "SPH_64" is defined if the 64-bit type is supported.
+ */
+
+#undef SPH_64
+#undef SPH_64_TRUE
+
+#if defined __STDC__ && __STDC_VERSION__ >= 199901L
+
+/*
+ * On C99 implementations, we can use <stdint.h> to get an exact 64-bit
+ * type, if any, or otherwise use a wider type (which must exist, for
+ * C99 conformance).
+ */
+
+#include <stdint.h>
+
+#ifdef UINT32_MAX
+typedef uint32_t sph_u32;
+typedef int32_t sph_s32;
+#else
+typedef uint_fast32_t sph_u32;
+typedef int_fast32_t sph_s32;
+#endif
+#if !SPH_NO_64
+#ifdef UINT64_MAX
+typedef uint64_t sph_u64;
+typedef int64_t sph_s64;
+#else
+typedef uint_fast64_t sph_u64;
+typedef int_fast64_t sph_s64;
+#endif
+#endif
+
+#define SPH_C32(x) ((sph_u32)(x))
+#if !SPH_NO_64
+#define SPH_C64(x) ((sph_u64)(x))
+#define SPH_64 1
+#endif
+
+#else
+
+/*
+ * On non-C99 systems, we use "unsigned int" if it is wide enough,
+ * "unsigned long" otherwise. This supports all "reasonable" architectures.
+ * We have to be cautious: pre-C99 preprocessors handle constants
+ * differently in '#if' expressions. Hence the shifts to test UINT_MAX.
+ */
+
+#if ((UINT_MAX >> 11) >> 11) >= 0x3FF
+
+typedef unsigned int sph_u32;
+typedef int sph_s32;
+
+#define SPH_C32(x) ((sph_u32)(x ## U))
+
+#else
+
+typedef unsigned long sph_u32;
+typedef long sph_s32;
+
+#define SPH_C32(x) ((sph_u32)(x ## UL))
+
+#endif
+
+#if !SPH_NO_64
+
+/*
+ * We want a 64-bit type. We use "unsigned long" if it is wide enough (as
+ * is common on 64-bit architectures such as AMD64, Alpha or Sparcv9),
+ * "unsigned long long" otherwise, if available. We use ULLONG_MAX to
+ * test whether "unsigned long long" is available; we also know that
+ * gcc features this type, even if the libc headers do not know it.
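+ *
+ * (The double shift below avoids writing a 64-bit constant in a
+ * preprocessor expression: a maximum value at least 64 bits wide,
+ * shifted right by 31 twice, leaves a value of 3 or more, while a
+ * 32-bit maximum leaves 0.)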
+ */ + +#if ((ULONG_MAX >> 31) >> 31) >= 3 + +typedef unsigned long sph_u64; +typedef long sph_s64; + +#define SPH_C64(x) ((sph_u64)(x ## UL)) + +#define SPH_64 1 + +#elif ((ULLONG_MAX >> 31) >> 31) >= 3 || defined __GNUC__ + +typedef unsigned long long sph_u64; +typedef long long sph_s64; + +#define SPH_C64(x) ((sph_u64)(x ## ULL)) + +#define SPH_64 1 + +#else + +/* + * No 64-bit type... + */ + +#endif + +#endif + +#endif + +/* + * If the "unsigned long" type has length 64 bits or more, then this is + * a "true" 64-bit architectures. This is also true with Visual C on + * amd64, even though the "long" type is limited to 32 bits. + */ +#if SPH_64 && (((ULONG_MAX >> 31) >> 31) >= 3 || defined _M_X64) +#define SPH_64_TRUE 1 +#endif + +/* + * Implementation note: some processors have specific opcodes to perform + * a rotation. Recent versions of gcc recognize the expression above and + * use the relevant opcodes, when appropriate. + */ + +#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) +#define SPH_ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n)))) +#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n))) + +#if SPH_64 + +#define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF)) +#define SPH_ROTL64(x, n) SPH_T64(((x) << (n)) | ((x) >> (64 - (n)))) +#define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n))) + +#endif + +#ifndef DOXYGEN_IGNORE +/* + * Define SPH_INLINE to be an "inline" qualifier, if available. We define + * some small macro-like functions which benefit greatly from being inlined. + */ +#if (defined __STDC__ && __STDC_VERSION__ >= 199901L) || defined __GNUC__ +#define SPH_INLINE inline +#elif defined _MSC_VER +#define SPH_INLINE __inline +#else +#define SPH_INLINE +#endif +#endif + +/* + * We define some macros which qualify the architecture. These macros + * may be explicit set externally (e.g. as compiler parameters). The + * code below sets those macros if they are not already defined. + * + * Most macros are boolean, thus evaluate to either zero or non-zero. + * The SPH_UPTR macro is special, in that it evaluates to a C type, + * or is not defined. + * + * SPH_UPTR if defined: unsigned type to cast pointers into + * + * SPH_UNALIGNED non-zero if unaligned accesses are efficient + * SPH_LITTLE_ENDIAN non-zero if architecture is known to be little-endian + * SPH_BIG_ENDIAN non-zero if architecture is known to be big-endian + * SPH_LITTLE_FAST non-zero if little-endian decoding is fast + * SPH_BIG_FAST non-zero if big-endian decoding is fast + * + * If SPH_UPTR is defined, then encoding and decoding of 32-bit and 64-bit + * values will try to be "smart". Either SPH_LITTLE_ENDIAN or SPH_BIG_ENDIAN + * _must_ be non-zero in those situations. The 32-bit and 64-bit types + * _must_ also have an exact width. + * + * SPH_SPARCV9_GCC_32 UltraSPARC-compatible with gcc, 32-bit mode + * SPH_SPARCV9_GCC_64 UltraSPARC-compatible with gcc, 64-bit mode + * SPH_SPARCV9_GCC UltraSPARC-compatible with gcc + * SPH_I386_GCC x86-compatible (32-bit) with gcc + * SPH_I386_MSVC x86-compatible (32-bit) with Microsoft Visual C + * SPH_AMD64_GCC x86-compatible (64-bit) with gcc + * SPH_AMD64_MSVC x86-compatible (64-bit) with Microsoft Visual C + * SPH_PPC32_GCC PowerPC, 32-bit, with gcc + * SPH_PPC64_GCC PowerPC, 64-bit, with gcc + * + * TODO: enhance automatic detection, for more architectures and compilers. + * Endianness is the most important. SPH_UNALIGNED and SPH_UPTR help with + * some very fast functions (e.g. MD4) when using unaligned input data. 
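+ *
+ * For example, on a platform that the detection code below does not
+ * recognize, the macros can be supplied on the compiler command line
+ * (an illustrative invocation; "file.c" stands for any source file
+ * that includes sph_types.h):
+ *
+ *    cc -DSPH_BIG_ENDIAN=1 -DSPH_BIG_FAST=1 -c file.c
+ *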
+ * The CPU-specific-with-GCC macros are useful only for inline assembly, + * normally restrained to this header file. + */ + +/* + * 32-bit x86, aka "i386 compatible". + */ +#if defined __i386__ || defined _M_IX86 + +#define SPH_DETECT_UNALIGNED 1 +#define SPH_DETECT_LITTLE_ENDIAN 1 +#define SPH_DETECT_UPTR sph_u32 +#ifdef __GNUC__ +#define SPH_DETECT_I386_GCC 1 +#endif +#ifdef _MSC_VER +#define SPH_DETECT_I386_MSVC 1 +#endif + +/* + * 64-bit x86, hereafter known as "amd64". + */ +#elif defined __x86_64 || defined _M_X64 + +#define SPH_DETECT_UNALIGNED 1 +#define SPH_DETECT_LITTLE_ENDIAN 1 +#define SPH_DETECT_UPTR sph_u64 +#ifdef __GNUC__ +#define SPH_DETECT_AMD64_GCC 1 +#endif +#ifdef _MSC_VER +#define SPH_DETECT_AMD64_MSVC 1 +#endif + +/* + * 64-bit Sparc architecture (implies v9). + */ +#elif ((defined __sparc__ || defined __sparc) && defined __arch64__) \ + || defined __sparcv9 + +#define SPH_DETECT_BIG_ENDIAN 1 +#define SPH_DETECT_UPTR sph_u64 +#ifdef __GNUC__ +#define SPH_DETECT_SPARCV9_GCC_64 1 +#define SPH_DETECT_LITTLE_FAST 1 +#endif + +/* + * 32-bit Sparc. + */ +#elif (defined __sparc__ || defined __sparc) \ + && !(defined __sparcv9 || defined __arch64__) + +#define SPH_DETECT_BIG_ENDIAN 1 +#define SPH_DETECT_UPTR sph_u32 +#if defined __GNUC__ && defined __sparc_v9__ +#define SPH_DETECT_SPARCV9_GCC_32 1 +#define SPH_DETECT_LITTLE_FAST 1 +#endif + +/* + * ARM, little-endian. + */ +#elif defined __arm__ && __ARMEL__ + +#define SPH_DETECT_LITTLE_ENDIAN 1 + +/* + * MIPS, little-endian. + */ +#elif MIPSEL || _MIPSEL || __MIPSEL || __MIPSEL__ + +#define SPH_DETECT_LITTLE_ENDIAN 1 + +/* + * MIPS, big-endian. + */ +#elif MIPSEB || _MIPSEB || __MIPSEB || __MIPSEB__ + +#define SPH_DETECT_BIG_ENDIAN 1 + +/* + * PowerPC. + */ +#elif defined __powerpc__ || defined __POWERPC__ || defined __ppc__ \ + || defined _ARCH_PPC + +/* + * Note: we do not declare cross-endian access to be "fast": even if + * using inline assembly, implementation should still assume that + * keeping the decoded word in a temporary is faster than decoding + * it again. + */ +#if defined __GNUC__ +#if SPH_64_TRUE +#define SPH_DETECT_PPC64_GCC 1 +#else +#define SPH_DETECT_PPC32_GCC 1 +#endif +#endif + +#if defined __BIG_ENDIAN__ || defined _BIG_ENDIAN +#define SPH_DETECT_BIG_ENDIAN 1 +#elif defined __LITTLE_ENDIAN__ || defined _LITTLE_ENDIAN +#define SPH_DETECT_LITTLE_ENDIAN 1 +#endif + +/* + * Itanium, 64-bit. 
+ */ +#elif defined __ia64 || defined __ia64__ \ + || defined __itanium__ || defined _M_IA64 + +#if defined __BIG_ENDIAN__ || defined _BIG_ENDIAN +#define SPH_DETECT_BIG_ENDIAN 1 +#else +#define SPH_DETECT_LITTLE_ENDIAN 1 +#endif +#if defined __LP64__ || defined _LP64 +#define SPH_DETECT_UPTR sph_u64 +#else +#define SPH_DETECT_UPTR sph_u32 +#endif + +#endif + +#if defined SPH_DETECT_SPARCV9_GCC_32 || defined SPH_DETECT_SPARCV9_GCC_64 +#define SPH_DETECT_SPARCV9_GCC 1 +#endif + +#if defined SPH_DETECT_UNALIGNED && !defined SPH_UNALIGNED +#define SPH_UNALIGNED SPH_DETECT_UNALIGNED +#endif +#if defined SPH_DETECT_UPTR && !defined SPH_UPTR +#define SPH_UPTR SPH_DETECT_UPTR +#endif +#if defined SPH_DETECT_LITTLE_ENDIAN && !defined SPH_LITTLE_ENDIAN +#define SPH_LITTLE_ENDIAN SPH_DETECT_LITTLE_ENDIAN +#endif +#if defined SPH_DETECT_BIG_ENDIAN && !defined SPH_BIG_ENDIAN +#define SPH_BIG_ENDIAN SPH_DETECT_BIG_ENDIAN +#endif +#if defined SPH_DETECT_LITTLE_FAST && !defined SPH_LITTLE_FAST +#define SPH_LITTLE_FAST SPH_DETECT_LITTLE_FAST +#endif +#if defined SPH_DETECT_BIG_FAST && !defined SPH_BIG_FAST +#define SPH_BIG_FAST SPH_DETECT_BIG_FAST +#endif +#if defined SPH_DETECT_SPARCV9_GCC_32 && !defined SPH_SPARCV9_GCC_32 +#define SPH_SPARCV9_GCC_32 SPH_DETECT_SPARCV9_GCC_32 +#endif +#if defined SPH_DETECT_SPARCV9_GCC_64 && !defined SPH_SPARCV9_GCC_64 +#define SPH_SPARCV9_GCC_64 SPH_DETECT_SPARCV9_GCC_64 +#endif +#if defined SPH_DETECT_SPARCV9_GCC && !defined SPH_SPARCV9_GCC +#define SPH_SPARCV9_GCC SPH_DETECT_SPARCV9_GCC +#endif +#if defined SPH_DETECT_I386_GCC && !defined SPH_I386_GCC +#define SPH_I386_GCC SPH_DETECT_I386_GCC +#endif +#if defined SPH_DETECT_I386_MSVC && !defined SPH_I386_MSVC +#define SPH_I386_MSVC SPH_DETECT_I386_MSVC +#endif +#if defined SPH_DETECT_AMD64_GCC && !defined SPH_AMD64_GCC +#define SPH_AMD64_GCC SPH_DETECT_AMD64_GCC +#endif +#if defined SPH_DETECT_AMD64_MSVC && !defined SPH_AMD64_MSVC +#define SPH_AMD64_MSVC SPH_DETECT_AMD64_MSVC +#endif +#if defined SPH_DETECT_PPC32_GCC && !defined SPH_PPC32_GCC +#define SPH_PPC32_GCC SPH_DETECT_PPC32_GCC +#endif +#if defined SPH_DETECT_PPC64_GCC && !defined SPH_PPC64_GCC +#define SPH_PPC64_GCC SPH_DETECT_PPC64_GCC +#endif + +#if SPH_LITTLE_ENDIAN && !defined SPH_LITTLE_FAST +#define SPH_LITTLE_FAST 1 +#endif +#if SPH_BIG_ENDIAN && !defined SPH_BIG_FAST +#define SPH_BIG_FAST 1 +#endif + +#if defined SPH_UPTR && !(SPH_LITTLE_ENDIAN || SPH_BIG_ENDIAN) +#error SPH_UPTR defined, but endianness is not known. +#endif + +#if SPH_I386_GCC && !SPH_NO_ASM + +/* + * On x86 32-bit, with gcc, we use the bswapl opcode to byte-swap 32-bit + * values. + */ + +static SPH_INLINE sph_u32 +sph_bswap32(sph_u32 x) +{ + __asm__ __volatile__ ("bswapl %0" : "=r" (x) : "0" (x)); + return x; +} + +#if SPH_64 + +static SPH_INLINE sph_u64 +sph_bswap64(sph_u64 x) +{ + return ((sph_u64)sph_bswap32((sph_u32)x) << 32) + | (sph_u64)sph_bswap32((sph_u32)(x >> 32)); +} + +#endif + +#elif SPH_AMD64_GCC && !SPH_NO_ASM + +/* + * On x86 64-bit, with gcc, we use the bswapl opcode to byte-swap 32-bit + * and 64-bit values. + */ + +static SPH_INLINE sph_u32 +sph_bswap32(sph_u32 x) +{ + __asm__ __volatile__ ("bswapl %0" : "=r" (x) : "0" (x)); + return x; +} + +#if SPH_64 + +static SPH_INLINE sph_u64 +sph_bswap64(sph_u64 x) +{ + __asm__ __volatile__ ("bswapq %0" : "=r" (x) : "0" (x)); + return x; +} + +#endif + +/* + * Disabled code. Apparently, Microsoft Visual C 2005 is smart enough + * to generate proper opcodes for endianness swapping with the pure C + * implementation below. 
+ * + +#elif SPH_I386_MSVC && !SPH_NO_ASM + +static __inline sph_u32 __declspec(naked) __fastcall +sph_bswap32(sph_u32 x) +{ + __asm { + bswap ecx + mov eax,ecx + ret + } +} + +#if SPH_64 + +static SPH_INLINE sph_u64 +sph_bswap64(sph_u64 x) +{ + return ((sph_u64)sph_bswap32((sph_u32)x) << 32) + | (sph_u64)sph_bswap32((sph_u32)(x >> 32)); +} + +#endif + + * + * [end of disabled code] + */ + +#else + +static SPH_INLINE sph_u32 +sph_bswap32(sph_u32 x) +{ + x = SPH_T32((x << 16) | (x >> 16)); + x = ((x & SPH_C32(0xFF00FF00)) >> 8) + | ((x & SPH_C32(0x00FF00FF)) << 8); + return x; +} + +#if SPH_64 + +/** + * Byte-swap a 64-bit value. + * + * @param x the input value + * @return the byte-swapped value + */ +static SPH_INLINE sph_u64 +sph_bswap64(sph_u64 x) +{ + x = SPH_T64((x << 32) | (x >> 32)); + x = ((x & SPH_C64(0xFFFF0000FFFF0000)) >> 16) + | ((x & SPH_C64(0x0000FFFF0000FFFF)) << 16); + x = ((x & SPH_C64(0xFF00FF00FF00FF00)) >> 8) + | ((x & SPH_C64(0x00FF00FF00FF00FF)) << 8); + return x; +} + +#endif + +#endif + +#if SPH_SPARCV9_GCC && !SPH_NO_ASM + +/* + * On UltraSPARC systems, native ordering is big-endian, but it is + * possible to perform little-endian read accesses by specifying the + * address space 0x88 (ASI_PRIMARY_LITTLE). Basically, either we use + * the opcode "lda [%reg]0x88,%dst", where %reg is the register which + * contains the source address and %dst is the destination register, + * or we use "lda [%reg+imm]%asi,%dst", which uses the %asi register + * to get the address space name. The latter format is better since it + * combines an addition and the actual access in a single opcode; but + * it requires the setting (and subsequent resetting) of %asi, which is + * slow. Some operations (i.e. MD5 compression function) combine many + * successive little-endian read accesses, which may share the same + * %asi setting. The macros below contain the appropriate inline + * assembly. + */ + +#define SPH_SPARCV9_SET_ASI \ + sph_u32 sph_sparcv9_asi; \ + __asm__ __volatile__ ( \ + "rd %%asi,%0\n\twr %%g0,0x88,%%asi" : "=r" (sph_sparcv9_asi)); + +#define SPH_SPARCV9_RESET_ASI \ + __asm__ __volatile__ ("wr %%g0,%0,%%asi" : : "r" (sph_sparcv9_asi)); + +#define SPH_SPARCV9_DEC32LE(base, idx) ({ \ + sph_u32 sph_sparcv9_tmp; \ + __asm__ __volatile__ ("lda [%1+" #idx "*4]%%asi,%0" \ + : "=r" (sph_sparcv9_tmp) : "r" (base)); \ + sph_sparcv9_tmp; \ + }) + +#endif + +static SPH_INLINE void +sph_enc16be(void *dst, unsigned val) +{ + ((unsigned char *)dst)[0] = (val >> 8); + ((unsigned char *)dst)[1] = val; +} + +static SPH_INLINE unsigned +sph_dec16be(const void *src) +{ + return ((unsigned)(((const unsigned char *)src)[0]) << 8) + | (unsigned)(((const unsigned char *)src)[1]); +} + +static SPH_INLINE void +sph_enc16le(void *dst, unsigned val) +{ + ((unsigned char *)dst)[0] = val; + ((unsigned char *)dst)[1] = val >> 8; +} + +static SPH_INLINE unsigned +sph_dec16le(const void *src) +{ + return (unsigned)(((const unsigned char *)src)[0]) + | ((unsigned)(((const unsigned char *)src)[1]) << 8); +} + +/** + * Encode a 32-bit value into the provided buffer (big endian convention). 
+ * + * @param dst the destination buffer + * @param val the 32-bit value to encode + */ +static SPH_INLINE void +sph_enc32be(void *dst, sph_u32 val) +{ +#if defined SPH_UPTR +#if SPH_UNALIGNED +#if SPH_LITTLE_ENDIAN + val = sph_bswap32(val); +#endif + *(sph_u32 *)dst = val; +#else + if (((SPH_UPTR)dst & 3) == 0) { +#if SPH_LITTLE_ENDIAN + val = sph_bswap32(val); +#endif + *(sph_u32 *)dst = val; + } else { + ((unsigned char *)dst)[0] = (val >> 24); + ((unsigned char *)dst)[1] = (val >> 16); + ((unsigned char *)dst)[2] = (val >> 8); + ((unsigned char *)dst)[3] = val; + } +#endif +#else + ((unsigned char *)dst)[0] = (val >> 24); + ((unsigned char *)dst)[1] = (val >> 16); + ((unsigned char *)dst)[2] = (val >> 8); + ((unsigned char *)dst)[3] = val; +#endif +} + +/** + * Encode a 32-bit value into the provided buffer (big endian convention). + * The destination buffer must be properly aligned. + * + * @param dst the destination buffer (32-bit aligned) + * @param val the value to encode + */ +static SPH_INLINE void +sph_enc32be_aligned(void *dst, sph_u32 val) +{ +#if SPH_LITTLE_ENDIAN + *(sph_u32 *)dst = sph_bswap32(val); +#elif SPH_BIG_ENDIAN + *(sph_u32 *)dst = val; +#else + ((unsigned char *)dst)[0] = (val >> 24); + ((unsigned char *)dst)[1] = (val >> 16); + ((unsigned char *)dst)[2] = (val >> 8); + ((unsigned char *)dst)[3] = val; +#endif +} + +/** + * Decode a 32-bit value from the provided buffer (big endian convention). + * + * @param src the source buffer + * @return the decoded value + */ +static SPH_INLINE sph_u32 +sph_dec32be(const void *src) +{ +#if defined SPH_UPTR +#if SPH_UNALIGNED +#if SPH_LITTLE_ENDIAN + return sph_bswap32(*(const sph_u32 *)src); +#else + return *(const sph_u32 *)src; +#endif +#else + if (((SPH_UPTR)src & 3) == 0) { +#if SPH_LITTLE_ENDIAN + return sph_bswap32(*(const sph_u32 *)src); +#else + return *(const sph_u32 *)src; +#endif + } else { + return ((sph_u32)(((const unsigned char *)src)[0]) << 24) + | ((sph_u32)(((const unsigned char *)src)[1]) << 16) + | ((sph_u32)(((const unsigned char *)src)[2]) << 8) + | (sph_u32)(((const unsigned char *)src)[3]); + } +#endif +#else + return ((sph_u32)(((const unsigned char *)src)[0]) << 24) + | ((sph_u32)(((const unsigned char *)src)[1]) << 16) + | ((sph_u32)(((const unsigned char *)src)[2]) << 8) + | (sph_u32)(((const unsigned char *)src)[3]); +#endif +} + +/** + * Decode a 32-bit value from the provided buffer (big endian convention). + * The source buffer must be properly aligned. + * + * @param src the source buffer (32-bit aligned) + * @return the decoded value + */ +static SPH_INLINE sph_u32 +sph_dec32be_aligned(const void *src) +{ +#if SPH_LITTLE_ENDIAN + return sph_bswap32(*(const sph_u32 *)src); +#elif SPH_BIG_ENDIAN + return *(const sph_u32 *)src; +#else + return ((sph_u32)(((const unsigned char *)src)[0]) << 24) + | ((sph_u32)(((const unsigned char *)src)[1]) << 16) + | ((sph_u32)(((const unsigned char *)src)[2]) << 8) + | (sph_u32)(((const unsigned char *)src)[3]); +#endif +} + +/** + * Encode a 32-bit value into the provided buffer (little endian convention). 
+ * + * @param dst the destination buffer + * @param val the 32-bit value to encode + */ +static SPH_INLINE void +sph_enc32le(void *dst, sph_u32 val) +{ +#if defined SPH_UPTR +#if SPH_UNALIGNED +#if SPH_BIG_ENDIAN + val = sph_bswap32(val); +#endif + *(sph_u32 *)dst = val; +#else + if (((SPH_UPTR)dst & 3) == 0) { +#if SPH_BIG_ENDIAN + val = sph_bswap32(val); +#endif + *(sph_u32 *)dst = val; + } else { + ((unsigned char *)dst)[0] = val; + ((unsigned char *)dst)[1] = (val >> 8); + ((unsigned char *)dst)[2] = (val >> 16); + ((unsigned char *)dst)[3] = (val >> 24); + } +#endif +#else + ((unsigned char *)dst)[0] = val; + ((unsigned char *)dst)[1] = (val >> 8); + ((unsigned char *)dst)[2] = (val >> 16); + ((unsigned char *)dst)[3] = (val >> 24); +#endif +} + +/** + * Encode a 32-bit value into the provided buffer (little endian convention). + * The destination buffer must be properly aligned. + * + * @param dst the destination buffer (32-bit aligned) + * @param val the value to encode + */ +static SPH_INLINE void +sph_enc32le_aligned(void *dst, sph_u32 val) +{ +#if SPH_LITTLE_ENDIAN + *(sph_u32 *)dst = val; +#elif SPH_BIG_ENDIAN + *(sph_u32 *)dst = sph_bswap32(val); +#else + ((unsigned char *)dst)[0] = val; + ((unsigned char *)dst)[1] = (val >> 8); + ((unsigned char *)dst)[2] = (val >> 16); + ((unsigned char *)dst)[3] = (val >> 24); +#endif +} + +/** + * Decode a 32-bit value from the provided buffer (little endian convention). + * + * @param src the source buffer + * @return the decoded value + */ +static SPH_INLINE sph_u32 +sph_dec32le(const void *src) +{ +#if defined SPH_UPTR +#if SPH_UNALIGNED +#if SPH_BIG_ENDIAN + return sph_bswap32(*(const sph_u32 *)src); +#else + return *(const sph_u32 *)src; +#endif +#else + if (((SPH_UPTR)src & 3) == 0) { +#if SPH_BIG_ENDIAN +#if SPH_SPARCV9_GCC && !SPH_NO_ASM + sph_u32 tmp; + + /* + * "__volatile__" is needed here because without it, + * gcc-3.4.3 miscompiles the code and performs the + * access before the test on the address, thus triggering + * a bus error... + */ + __asm__ __volatile__ ( + "lda [%1]0x88,%0" : "=r" (tmp) : "r" (src)); + return tmp; +/* + * On PowerPC, this turns out not to be worth the effort: the inline + * assembly makes GCC optimizer uncomfortable, which tends to nullify + * the decoding gains. + * + * For most hash functions, using this inline assembly trick changes + * hashing speed by less than 5% and often _reduces_ it. The biggest + * gains are for MD4 (+11%) and CubeHash (+30%). For all others, it is + * less then 10%. The speed gain on CubeHash is probably due to the + * chronic shortage of registers that CubeHash endures; for the other + * functions, the generic code appears to be efficient enough already. + * +#elif (SPH_PPC32_GCC || SPH_PPC64_GCC) && !SPH_NO_ASM + sph_u32 tmp; + + __asm__ __volatile__ ( + "lwbrx %0,0,%1" : "=r" (tmp) : "r" (src)); + return tmp; + */ +#else + return sph_bswap32(*(const sph_u32 *)src); +#endif +#else + return *(const sph_u32 *)src; +#endif + } else { + return (sph_u32)(((const unsigned char *)src)[0]) + | ((sph_u32)(((const unsigned char *)src)[1]) << 8) + | ((sph_u32)(((const unsigned char *)src)[2]) << 16) + | ((sph_u32)(((const unsigned char *)src)[3]) << 24); + } +#endif +#else + return (sph_u32)(((const unsigned char *)src)[0]) + | ((sph_u32)(((const unsigned char *)src)[1]) << 8) + | ((sph_u32)(((const unsigned char *)src)[2]) << 16) + | ((sph_u32)(((const unsigned char *)src)[3]) << 24); +#endif +} + +/** + * Decode a 32-bit value from the provided buffer (little endian convention). 
+ * The source buffer must be properly aligned. + * + * @param src the source buffer (32-bit aligned) + * @return the decoded value + */ +static SPH_INLINE sph_u32 +sph_dec32le_aligned(const void *src) +{ +#if SPH_LITTLE_ENDIAN + return *(const sph_u32 *)src; +#elif SPH_BIG_ENDIAN +#if SPH_SPARCV9_GCC && !SPH_NO_ASM + sph_u32 tmp; + + __asm__ __volatile__ ("lda [%1]0x88,%0" : "=r" (tmp) : "r" (src)); + return tmp; +/* + * Not worth it generally. + * +#elif (SPH_PPC32_GCC || SPH_PPC64_GCC) && !SPH_NO_ASM + sph_u32 tmp; + + __asm__ __volatile__ ("lwbrx %0,0,%1" : "=r" (tmp) : "r" (src)); + return tmp; + */ +#else + return sph_bswap32(*(const sph_u32 *)src); +#endif +#else + return (sph_u32)(((const unsigned char *)src)[0]) + | ((sph_u32)(((const unsigned char *)src)[1]) << 8) + | ((sph_u32)(((const unsigned char *)src)[2]) << 16) + | ((sph_u32)(((const unsigned char *)src)[3]) << 24); +#endif +} + +#if SPH_64 + +/** + * Encode a 64-bit value into the provided buffer (big endian convention). + * + * @param dst the destination buffer + * @param val the 64-bit value to encode + */ +static SPH_INLINE void +sph_enc64be(void *dst, sph_u64 val) +{ +#if defined SPH_UPTR +#if SPH_UNALIGNED +#if SPH_LITTLE_ENDIAN + val = sph_bswap64(val); +#endif + *(sph_u64 *)dst = val; +#else + if (((SPH_UPTR)dst & 7) == 0) { +#if SPH_LITTLE_ENDIAN + val = sph_bswap64(val); +#endif + *(sph_u64 *)dst = val; + } else { + ((unsigned char *)dst)[0] = (val >> 56); + ((unsigned char *)dst)[1] = (val >> 48); + ((unsigned char *)dst)[2] = (val >> 40); + ((unsigned char *)dst)[3] = (val >> 32); + ((unsigned char *)dst)[4] = (val >> 24); + ((unsigned char *)dst)[5] = (val >> 16); + ((unsigned char *)dst)[6] = (val >> 8); + ((unsigned char *)dst)[7] = val; + } +#endif +#else + ((unsigned char *)dst)[0] = (val >> 56); + ((unsigned char *)dst)[1] = (val >> 48); + ((unsigned char *)dst)[2] = (val >> 40); + ((unsigned char *)dst)[3] = (val >> 32); + ((unsigned char *)dst)[4] = (val >> 24); + ((unsigned char *)dst)[5] = (val >> 16); + ((unsigned char *)dst)[6] = (val >> 8); + ((unsigned char *)dst)[7] = val; +#endif +} + +/** + * Encode a 64-bit value into the provided buffer (big endian convention). + * The destination buffer must be properly aligned. + * + * @param dst the destination buffer (64-bit aligned) + * @param val the value to encode + */ +static SPH_INLINE void +sph_enc64be_aligned(void *dst, sph_u64 val) +{ +#if SPH_LITTLE_ENDIAN + *(sph_u64 *)dst = sph_bswap64(val); +#elif SPH_BIG_ENDIAN + *(sph_u64 *)dst = val; +#else + ((unsigned char *)dst)[0] = (val >> 56); + ((unsigned char *)dst)[1] = (val >> 48); + ((unsigned char *)dst)[2] = (val >> 40); + ((unsigned char *)dst)[3] = (val >> 32); + ((unsigned char *)dst)[4] = (val >> 24); + ((unsigned char *)dst)[5] = (val >> 16); + ((unsigned char *)dst)[6] = (val >> 8); + ((unsigned char *)dst)[7] = val; +#endif +} + +/** + * Decode a 64-bit value from the provided buffer (big endian convention). 
+ * + * @param src the source buffer + * @return the decoded value + */ +static SPH_INLINE sph_u64 +sph_dec64be(const void *src) +{ +#if defined SPH_UPTR +#if SPH_UNALIGNED +#if SPH_LITTLE_ENDIAN + return sph_bswap64(*(const sph_u64 *)src); +#else + return *(const sph_u64 *)src; +#endif +#else + if (((SPH_UPTR)src & 7) == 0) { +#if SPH_LITTLE_ENDIAN + return sph_bswap64(*(const sph_u64 *)src); +#else + return *(const sph_u64 *)src; +#endif + } else { + return ((sph_u64)(((const unsigned char *)src)[0]) << 56) + | ((sph_u64)(((const unsigned char *)src)[1]) << 48) + | ((sph_u64)(((const unsigned char *)src)[2]) << 40) + | ((sph_u64)(((const unsigned char *)src)[3]) << 32) + | ((sph_u64)(((const unsigned char *)src)[4]) << 24) + | ((sph_u64)(((const unsigned char *)src)[5]) << 16) + | ((sph_u64)(((const unsigned char *)src)[6]) << 8) + | (sph_u64)(((const unsigned char *)src)[7]); + } +#endif +#else + return ((sph_u64)(((const unsigned char *)src)[0]) << 56) + | ((sph_u64)(((const unsigned char *)src)[1]) << 48) + | ((sph_u64)(((const unsigned char *)src)[2]) << 40) + | ((sph_u64)(((const unsigned char *)src)[3]) << 32) + | ((sph_u64)(((const unsigned char *)src)[4]) << 24) + | ((sph_u64)(((const unsigned char *)src)[5]) << 16) + | ((sph_u64)(((const unsigned char *)src)[6]) << 8) + | (sph_u64)(((const unsigned char *)src)[7]); +#endif +} + +/** + * Decode a 64-bit value from the provided buffer (big endian convention). + * The source buffer must be properly aligned. + * + * @param src the source buffer (64-bit aligned) + * @return the decoded value + */ +static SPH_INLINE sph_u64 +sph_dec64be_aligned(const void *src) +{ +#if SPH_LITTLE_ENDIAN + return sph_bswap64(*(const sph_u64 *)src); +#elif SPH_BIG_ENDIAN + return *(const sph_u64 *)src; +#else + return ((sph_u64)(((const unsigned char *)src)[0]) << 56) + | ((sph_u64)(((const unsigned char *)src)[1]) << 48) + | ((sph_u64)(((const unsigned char *)src)[2]) << 40) + | ((sph_u64)(((const unsigned char *)src)[3]) << 32) + | ((sph_u64)(((const unsigned char *)src)[4]) << 24) + | ((sph_u64)(((const unsigned char *)src)[5]) << 16) + | ((sph_u64)(((const unsigned char *)src)[6]) << 8) + | (sph_u64)(((const unsigned char *)src)[7]); +#endif +} + +/** + * Encode a 64-bit value into the provided buffer (little endian convention). + * + * @param dst the destination buffer + * @param val the 64-bit value to encode + */ +static SPH_INLINE void +sph_enc64le(void *dst, sph_u64 val) +{ +#if defined SPH_UPTR +#if SPH_UNALIGNED +#if SPH_BIG_ENDIAN + val = sph_bswap64(val); +#endif + *(sph_u64 *)dst = val; +#else + if (((SPH_UPTR)dst & 7) == 0) { +#if SPH_BIG_ENDIAN + val = sph_bswap64(val); +#endif + *(sph_u64 *)dst = val; + } else { + ((unsigned char *)dst)[0] = val; + ((unsigned char *)dst)[1] = (val >> 8); + ((unsigned char *)dst)[2] = (val >> 16); + ((unsigned char *)dst)[3] = (val >> 24); + ((unsigned char *)dst)[4] = (val >> 32); + ((unsigned char *)dst)[5] = (val >> 40); + ((unsigned char *)dst)[6] = (val >> 48); + ((unsigned char *)dst)[7] = (val >> 56); + } +#endif +#else + ((unsigned char *)dst)[0] = val; + ((unsigned char *)dst)[1] = (val >> 8); + ((unsigned char *)dst)[2] = (val >> 16); + ((unsigned char *)dst)[3] = (val >> 24); + ((unsigned char *)dst)[4] = (val >> 32); + ((unsigned char *)dst)[5] = (val >> 40); + ((unsigned char *)dst)[6] = (val >> 48); + ((unsigned char *)dst)[7] = (val >> 56); +#endif +} + +/** + * Encode a 64-bit value into the provided buffer (little endian convention). + * The destination buffer must be properly aligned. 
+ * + * @param dst the destination buffer (64-bit aligned) + * @param val the value to encode + */ +static SPH_INLINE void +sph_enc64le_aligned(void *dst, sph_u64 val) +{ +#if SPH_LITTLE_ENDIAN + *(sph_u64 *)dst = val; +#elif SPH_BIG_ENDIAN + *(sph_u64 *)dst = sph_bswap64(val); +#else + ((unsigned char *)dst)[0] = val; + ((unsigned char *)dst)[1] = (val >> 8); + ((unsigned char *)dst)[2] = (val >> 16); + ((unsigned char *)dst)[3] = (val >> 24); + ((unsigned char *)dst)[4] = (val >> 32); + ((unsigned char *)dst)[5] = (val >> 40); + ((unsigned char *)dst)[6] = (val >> 48); + ((unsigned char *)dst)[7] = (val >> 56); +#endif +} + +/** + * Decode a 64-bit value from the provided buffer (little endian convention). + * + * @param src the source buffer + * @return the decoded value + */ +static SPH_INLINE sph_u64 +sph_dec64le(const void *src) +{ +#if defined SPH_UPTR +#if SPH_UNALIGNED +#if SPH_BIG_ENDIAN + return sph_bswap64(*(const sph_u64 *)src); +#else + return *(const sph_u64 *)src; +#endif +#else + if (((SPH_UPTR)src & 7) == 0) { +#if SPH_BIG_ENDIAN +#if SPH_SPARCV9_GCC_64 && !SPH_NO_ASM + sph_u64 tmp; + + __asm__ __volatile__ ( + "ldxa [%1]0x88,%0" : "=r" (tmp) : "r" (src)); + return tmp; +/* + * Not worth it generally. + * +#elif SPH_PPC32_GCC && !SPH_NO_ASM + return (sph_u64)sph_dec32le_aligned(src) + | ((sph_u64)sph_dec32le_aligned( + (const char *)src + 4) << 32); +#elif SPH_PPC64_GCC && !SPH_NO_ASM + sph_u64 tmp; + + __asm__ __volatile__ ( + "ldbrx %0,0,%1" : "=r" (tmp) : "r" (src)); + return tmp; + */ +#else + return sph_bswap64(*(const sph_u64 *)src); +#endif +#else + return *(const sph_u64 *)src; +#endif + } else { + return (sph_u64)(((const unsigned char *)src)[0]) + | ((sph_u64)(((const unsigned char *)src)[1]) << 8) + | ((sph_u64)(((const unsigned char *)src)[2]) << 16) + | ((sph_u64)(((const unsigned char *)src)[3]) << 24) + | ((sph_u64)(((const unsigned char *)src)[4]) << 32) + | ((sph_u64)(((const unsigned char *)src)[5]) << 40) + | ((sph_u64)(((const unsigned char *)src)[6]) << 48) + | ((sph_u64)(((const unsigned char *)src)[7]) << 56); + } +#endif +#else + return (sph_u64)(((const unsigned char *)src)[0]) + | ((sph_u64)(((const unsigned char *)src)[1]) << 8) + | ((sph_u64)(((const unsigned char *)src)[2]) << 16) + | ((sph_u64)(((const unsigned char *)src)[3]) << 24) + | ((sph_u64)(((const unsigned char *)src)[4]) << 32) + | ((sph_u64)(((const unsigned char *)src)[5]) << 40) + | ((sph_u64)(((const unsigned char *)src)[6]) << 48) + | ((sph_u64)(((const unsigned char *)src)[7]) << 56); +#endif +} + +/** + * Decode a 64-bit value from the provided buffer (little endian convention). + * The source buffer must be properly aligned. + * + * @param src the source buffer (64-bit aligned) + * @return the decoded value + */ +static SPH_INLINE sph_u64 +sph_dec64le_aligned(const void *src) +{ +#if SPH_LITTLE_ENDIAN + return *(const sph_u64 *)src; +#elif SPH_BIG_ENDIAN +#if SPH_SPARCV9_GCC_64 && !SPH_NO_ASM + sph_u64 tmp; + + __asm__ __volatile__ ("ldxa [%1]0x88,%0" : "=r" (tmp) : "r" (src)); + return tmp; +/* + * Not worth it generally. 
+ * +#elif SPH_PPC32_GCC && !SPH_NO_ASM + return (sph_u64)sph_dec32le_aligned(src) + | ((sph_u64)sph_dec32le_aligned((const char *)src + 4) << 32); +#elif SPH_PPC64_GCC && !SPH_NO_ASM + sph_u64 tmp; + + __asm__ __volatile__ ("ldbrx %0,0,%1" : "=r" (tmp) : "r" (src)); + return tmp; + */ +#else + return sph_bswap64(*(const sph_u64 *)src); +#endif +#else + return (sph_u64)(((const unsigned char *)src)[0]) + | ((sph_u64)(((const unsigned char *)src)[1]) << 8) + | ((sph_u64)(((const unsigned char *)src)[2]) << 16) + | ((sph_u64)(((const unsigned char *)src)[3]) << 24) + | ((sph_u64)(((const unsigned char *)src)[4]) << 32) + | ((sph_u64)(((const unsigned char *)src)[5]) << 40) + | ((sph_u64)(((const unsigned char *)src)[6]) << 48) + | ((sph_u64)(((const unsigned char *)src)[7]) << 56); +#endif +} + +#endif + +#endif /* Doxygen excluded block */ + +#endif diff --git a/uint256.h b/uint256.h new file mode 100644 index 0000000..2a252c9 --- /dev/null +++ b/uint256.h @@ -0,0 +1,784 @@ +// Copyright (c) 2009-2010 Satoshi Nakamoto +// Copyright (c) 2009-2012 The Bitcoin developers +// Distributed under the MIT/X11 software license, see the accompanying +// file COPYING or http://www.opensource.org/licenses/mit-license.php. +#ifndef BITCOIN_UINT256_H +#define BITCOIN_UINT256_H + +#include +#include +#include +#include +#include +#include + +typedef long long int64; +typedef unsigned long long uint64; + + +inline int Testuint256AdHoc(std::vector vArg); + + + +/** Base class without constructors for uint256 and uint160. + * This makes the compiler let you use it in a union. + */ +template +class base_uint +{ +protected: + enum { WIDTH=BITS/32 }; + uint32_t pn[WIDTH]; +public: + + bool operator!() const + { + for (int i = 0; i < WIDTH; i++) + if (pn[i] != 0) + return false; + return true; + } + + const base_uint operator~() const + { + base_uint ret; + for (int i = 0; i < WIDTH; i++) + ret.pn[i] = ~pn[i]; + return ret; + } + + const base_uint operator-() const + { + base_uint ret; + for (int i = 0; i < WIDTH; i++) + ret.pn[i] = ~pn[i]; + ret++; + return ret; + } + + double getdouble() const + { + double ret = 0.0; + double fact = 1.0; + for (int i = 0; i < WIDTH; i++) { + ret += fact * pn[i]; + fact *= 4294967296.0; + } + return ret; + } + + base_uint& operator=(uint64 b) + { + pn[0] = (unsigned int)b; + pn[1] = (unsigned int)(b >> 32); + for (int i = 2; i < WIDTH; i++) + pn[i] = 0; + return *this; + } + + base_uint& operator^=(const base_uint& b) + { + for (int i = 0; i < WIDTH; i++) + pn[i] ^= b.pn[i]; + return *this; + } + + base_uint& operator&=(const base_uint& b) + { + for (int i = 0; i < WIDTH; i++) + pn[i] &= b.pn[i]; + return *this; + } + + base_uint& operator|=(const base_uint& b) + { + for (int i = 0; i < WIDTH; i++) + pn[i] |= b.pn[i]; + return *this; + } + + base_uint& operator^=(uint64 b) + { + pn[0] ^= (unsigned int)b; + pn[1] ^= (unsigned int)(b >> 32); + return *this; + } + + base_uint& operator|=(uint64 b) + { + pn[0] |= (unsigned int)b; + pn[1] |= (unsigned int)(b >> 32); + return *this; + } + + base_uint& operator<<=(unsigned int shift) + { + base_uint a(*this); + for (int i = 0; i < WIDTH; i++) + pn[i] = 0; + int k = shift / 32; + shift = shift % 32; + for (int i = 0; i < WIDTH; i++) + { + if (i+k+1 < WIDTH && shift != 0) + pn[i+k+1] |= (a.pn[i] >> (32-shift)); + if (i+k < WIDTH) + pn[i+k] |= (a.pn[i] << shift); + } + return *this; + } + + base_uint& operator>>=(unsigned int shift) + { + base_uint a(*this); + for (int i = 0; i < WIDTH; i++) + pn[i] = 0; + int k = shift / 32; + shift = 
shift % 32; + for (int i = 0; i < WIDTH; i++) + { + if (i-k-1 >= 0 && shift != 0) + pn[i-k-1] |= (a.pn[i] << (32-shift)); + if (i-k >= 0) + pn[i-k] |= (a.pn[i] >> shift); + } + return *this; + } + + base_uint& operator+=(const base_uint& b) + { + uint64 carry = 0; + for (int i = 0; i < WIDTH; i++) + { + uint64 n = carry + pn[i] + b.pn[i]; + pn[i] = n & 0xffffffff; + carry = n >> 32; + } + return *this; + } + + base_uint& operator-=(const base_uint& b) + { + *this += -b; + return *this; + } + + base_uint& operator+=(uint64 b64) + { + base_uint b; + b = b64; + *this += b; + return *this; + } + + base_uint& operator-=(uint64 b64) + { + base_uint b; + b = b64; + *this += -b; + return *this; + } + + + base_uint& operator++() + { + // prefix operator + int i = 0; + while (++pn[i] == 0 && i < WIDTH-1) + i++; + return *this; + } + + const base_uint operator++(int) + { + // postfix operator + const base_uint ret = *this; + ++(*this); + return ret; + } + + base_uint& operator--() + { + // prefix operator + int i = 0; + while (--pn[i] == -1 && i < WIDTH-1) + i++; + return *this; + } + + const base_uint operator--(int) + { + // postfix operator + const base_uint ret = *this; + --(*this); + return ret; + } + + + friend inline bool operator<(const base_uint& a, const base_uint& b) + { + for (int i = base_uint::WIDTH-1; i >= 0; i--) + { + if (a.pn[i] < b.pn[i]) + return true; + else if (a.pn[i] > b.pn[i]) + return false; + } + return false; + } + + friend inline bool operator<=(const base_uint& a, const base_uint& b) + { + for (int i = base_uint::WIDTH-1; i >= 0; i--) + { + if (a.pn[i] < b.pn[i]) + return true; + else if (a.pn[i] > b.pn[i]) + return false; + } + return true; + } + + friend inline bool operator>(const base_uint& a, const base_uint& b) + { + for (int i = base_uint::WIDTH-1; i >= 0; i--) + { + if (a.pn[i] > b.pn[i]) + return true; + else if (a.pn[i] < b.pn[i]) + return false; + } + return false; + } + + friend inline bool operator>=(const base_uint& a, const base_uint& b) + { + for (int i = base_uint::WIDTH-1; i >= 0; i--) + { + if (a.pn[i] > b.pn[i]) + return true; + else if (a.pn[i] < b.pn[i]) + return false; + } + return true; + } + + friend inline bool operator==(const base_uint& a, const base_uint& b) + { + for (int i = 0; i < base_uint::WIDTH; i++) + if (a.pn[i] != b.pn[i]) + return false; + return true; + } + + friend inline bool operator==(const base_uint& a, uint64 b) + { + if (a.pn[0] != (unsigned int)b) + return false; + if (a.pn[1] != (unsigned int)(b >> 32)) + return false; + for (int i = 2; i < base_uint::WIDTH; i++) + if (a.pn[i] != 0) + return false; + return true; + } + + friend inline bool operator!=(const base_uint& a, const base_uint& b) + { + return (!(a == b)); + } + + friend inline bool operator!=(const base_uint& a, uint64 b) + { + return (!(a == b)); + } + + + + std::string GetHex() const + { + char psz[sizeof(pn)*2 + 1]; + for (unsigned int i = 0; i < sizeof(pn); i++) + sprintf(psz + i*2, "%02x", ((unsigned char*)pn)[sizeof(pn) - i - 1]); + return std::string(psz, psz + sizeof(pn)*2); + } + + void SetHex(const char* psz) + { + for (int i = 0; i < WIDTH; i++) + pn[i] = 0; + + // skip leading spaces + while (isspace(*psz)) + psz++; + + // skip 0x + if (psz[0] == '0' && tolower(psz[1]) == 'x') + psz += 2; + + // hex string to uint + static const unsigned char phexdigit[256] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,1,2,3,4,5,6,7,8,9,0,0,0,0,0,0, 0,0xa,0xb,0xc,0xd,0xe,0xf,0,0,0,0,0,0,0,0,0, 
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0xa,0xb,0xc,0xd,0xe,0xf,0,0,0,0,0,0,0,0,0 }; + const char* pbegin = psz; + while (phexdigit[(unsigned char)*psz] || *psz == '0') + psz++; + psz--; + unsigned char* p1 = (unsigned char*)pn; + unsigned char* pend = p1 + WIDTH * 4; + while (psz >= pbegin && p1 < pend) + { + *p1 = phexdigit[(unsigned char)*psz--]; + if (psz >= pbegin) + { + *p1 |= (phexdigit[(unsigned char)*psz--] << 4); + p1++; + } + } + } + + void SetHex(const std::string& str) + { + SetHex(str.c_str()); + } + + std::string ToString() const + { + return (GetHex()); + } + + unsigned char* begin() + { + return (unsigned char*)&pn[0]; + } + + unsigned char* end() + { + return (unsigned char*)&pn[WIDTH]; + } + + const unsigned char* begin() const + { + return (unsigned char*)&pn[0]; + } + + const unsigned char* end() const + { + return (unsigned char*)&pn[WIDTH]; + } + + unsigned int size() const + { + return sizeof(pn); + } + + uint64 Get64(int n=0) const + { + return pn[2*n] | (uint64)pn[2*n+1] << 32; + } + +// unsigned int GetSerializeSize(int nType=0, int nVersion=PROTOCOL_VERSION) const + unsigned int GetSerializeSize(int nType, int nVersion) const + { + return sizeof(pn); + } + + template<typename Stream> +// void Serialize(Stream& s, int nType=0, int nVersion=PROTOCOL_VERSION) const + void Serialize(Stream& s, int nType, int nVersion) const + { + s.write((char*)pn, sizeof(pn)); + } + + template<typename Stream> +// void Unserialize(Stream& s, int nType=0, int nVersion=PROTOCOL_VERSION) + void Unserialize(Stream& s, int nType, int nVersion) + { + s.read((char*)pn, sizeof(pn)); + } + + + friend class uint160; + friend class uint256; + friend inline int Testuint256AdHoc(std::vector vArg); +}; + +typedef base_uint<160> base_uint160; +typedef base_uint<256> base_uint256; + + + +// +// uint160 and uint256 could be implemented as templates, but to keep +// compile errors and debugging cleaner, they're copy and pasted.
+// + + + +////////////////////////////////////////////////////////////////////////////// +// +// uint160 +// + +/** 160-bit unsigned integer */ +class uint160 : public base_uint160 +{ +public: + typedef base_uint160 basetype; + + uint160() + { + for (int i = 0; i < WIDTH; i++) + pn[i] = 0; + } + + uint160(const basetype& b) + { + for (int i = 0; i < WIDTH; i++) + pn[i] = b.pn[i]; + } + + uint160& operator=(const basetype& b) + { + for (int i = 0; i < WIDTH; i++) + pn[i] = b.pn[i]; + return *this; + } + + uint160(uint64 b) + { + pn[0] = (unsigned int)b; + pn[1] = (unsigned int)(b >> 32); + for (int i = 2; i < WIDTH; i++) + pn[i] = 0; + } + + uint160& operator=(uint64 b) + { + pn[0] = (unsigned int)b; + pn[1] = (unsigned int)(b >> 32); + for (int i = 2; i < WIDTH; i++) + pn[i] = 0; + return *this; + } + + explicit uint160(const std::string& str) + { + SetHex(str); + } + + explicit uint160(const std::vector& vch) + { + if (vch.size() == sizeof(pn)) + memcpy(pn, &vch[0], sizeof(pn)); + else + *this = 0; + } +}; + +inline bool operator==(const uint160& a, uint64 b) { return (base_uint160)a == b; } +inline bool operator!=(const uint160& a, uint64 b) { return (base_uint160)a != b; } +inline const uint160 operator<<(const base_uint160& a, unsigned int shift) { return uint160(a) <<= shift; } +inline const uint160 operator>>(const base_uint160& a, unsigned int shift) { return uint160(a) >>= shift; } +inline const uint160 operator<<(const uint160& a, unsigned int shift) { return uint160(a) <<= shift; } +inline const uint160 operator>>(const uint160& a, unsigned int shift) { return uint160(a) >>= shift; } + +inline const uint160 operator^(const base_uint160& a, const base_uint160& b) { return uint160(a) ^= b; } +inline const uint160 operator&(const base_uint160& a, const base_uint160& b) { return uint160(a) &= b; } +inline const uint160 operator|(const base_uint160& a, const base_uint160& b) { return uint160(a) |= b; } +inline const uint160 operator+(const base_uint160& a, const base_uint160& b) { return uint160(a) += b; } +inline const uint160 operator-(const base_uint160& a, const base_uint160& b) { return uint160(a) -= b; } + +inline bool operator<(const base_uint160& a, const uint160& b) { return (base_uint160)a < (base_uint160)b; } +inline bool operator<=(const base_uint160& a, const uint160& b) { return (base_uint160)a <= (base_uint160)b; } +inline bool operator>(const base_uint160& a, const uint160& b) { return (base_uint160)a > (base_uint160)b; } +inline bool operator>=(const base_uint160& a, const uint160& b) { return (base_uint160)a >= (base_uint160)b; } +inline bool operator==(const base_uint160& a, const uint160& b) { return (base_uint160)a == (base_uint160)b; } +inline bool operator!=(const base_uint160& a, const uint160& b) { return (base_uint160)a != (base_uint160)b; } +inline const uint160 operator^(const base_uint160& a, const uint160& b) { return (base_uint160)a ^ (base_uint160)b; } +inline const uint160 operator&(const base_uint160& a, const uint160& b) { return (base_uint160)a & (base_uint160)b; } +inline const uint160 operator|(const base_uint160& a, const uint160& b) { return (base_uint160)a | (base_uint160)b; } +inline const uint160 operator+(const base_uint160& a, const uint160& b) { return (base_uint160)a + (base_uint160)b; } +inline const uint160 operator-(const base_uint160& a, const uint160& b) { return (base_uint160)a - (base_uint160)b; } + +inline bool operator<(const uint160& a, const base_uint160& b) { return (base_uint160)a < (base_uint160)b; } +inline bool 
operator<=(const uint160& a, const base_uint160& b) { return (base_uint160)a <= (base_uint160)b; } +inline bool operator>(const uint160& a, const base_uint160& b) { return (base_uint160)a > (base_uint160)b; } +inline bool operator>=(const uint160& a, const base_uint160& b) { return (base_uint160)a >= (base_uint160)b; } +inline bool operator==(const uint160& a, const base_uint160& b) { return (base_uint160)a == (base_uint160)b; } +inline bool operator!=(const uint160& a, const base_uint160& b) { return (base_uint160)a != (base_uint160)b; } +inline const uint160 operator^(const uint160& a, const base_uint160& b) { return (base_uint160)a ^ (base_uint160)b; } +inline const uint160 operator&(const uint160& a, const base_uint160& b) { return (base_uint160)a & (base_uint160)b; } +inline const uint160 operator|(const uint160& a, const base_uint160& b) { return (base_uint160)a | (base_uint160)b; } +inline const uint160 operator+(const uint160& a, const base_uint160& b) { return (base_uint160)a + (base_uint160)b; } +inline const uint160 operator-(const uint160& a, const base_uint160& b) { return (base_uint160)a - (base_uint160)b; } + +inline bool operator<(const uint160& a, const uint160& b) { return (base_uint160)a < (base_uint160)b; } +inline bool operator<=(const uint160& a, const uint160& b) { return (base_uint160)a <= (base_uint160)b; } +inline bool operator>(const uint160& a, const uint160& b) { return (base_uint160)a > (base_uint160)b; } +inline bool operator>=(const uint160& a, const uint160& b) { return (base_uint160)a >= (base_uint160)b; } +inline bool operator==(const uint160& a, const uint160& b) { return (base_uint160)a == (base_uint160)b; } +inline bool operator!=(const uint160& a, const uint160& b) { return (base_uint160)a != (base_uint160)b; } +inline const uint160 operator^(const uint160& a, const uint160& b) { return (base_uint160)a ^ (base_uint160)b; } +inline const uint160 operator&(const uint160& a, const uint160& b) { return (base_uint160)a & (base_uint160)b; } +inline const uint160 operator|(const uint160& a, const uint160& b) { return (base_uint160)a | (base_uint160)b; } +inline const uint160 operator+(const uint160& a, const uint160& b) { return (base_uint160)a + (base_uint160)b; } +inline const uint160 operator-(const uint160& a, const uint160& b) { return (base_uint160)a - (base_uint160)b; } + + + + + + +////////////////////////////////////////////////////////////////////////////// +// +// uint256 +// + +/** 256-bit unsigned integer */ +class uint256 : public base_uint256 +{ +public: + typedef base_uint256 basetype; + + uint256() + { + for (int i = 0; i < WIDTH; i++) + pn[i] = 0; + } + + uint256(const basetype& b) + { + for (int i = 0; i < WIDTH; i++) + pn[i] = b.pn[i]; + } + + uint256& operator=(const basetype& b) + { + for (int i = 0; i < WIDTH; i++) + pn[i] = b.pn[i]; + return *this; + } + + uint256(uint64 b) + { + pn[0] = (unsigned int)b; + pn[1] = (unsigned int)(b >> 32); + for (int i = 2; i < WIDTH; i++) + pn[i] = 0; + } + + uint256& operator=(uint64 b) + { + pn[0] = (unsigned int)b; + pn[1] = (unsigned int)(b >> 32); + for (int i = 2; i < WIDTH; i++) + pn[i] = 0; + return *this; + } + + explicit uint256(const std::string& str) + { + SetHex(str); + } + + explicit uint256(const std::vector& vch) + { + if (vch.size() == sizeof(pn)) + memcpy(pn, &vch[0], sizeof(pn)); + else + *this = 0; + } +}; + +inline bool operator==(const uint256& a, uint64 b) { return (base_uint256)a == b; } +inline bool operator!=(const uint256& a, uint64 b) { return (base_uint256)a != b; } 
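+// Editorial sketch, not part of the original header: a minimal usage example
+// for uint256, assuming only the members and free operators defined in this
+// file (SetHex/ToString, the shift operators and the comparisons). The hex
+// constant is an arbitrary placeholder value.
+//
+//     uint256 hash("0x123456789abcdef");        // parsed by SetHex()
+//     uint256 target = uint256(1) << 236;       // threshold built by shifting
+//     bool below = (hash <= target);            // word-by-word compare
+//     printf("%s\n", hash.ToString().c_str());  // hex round-trip via GetHex()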
+inline const uint256 operator<<(const base_uint256& a, unsigned int shift) { return uint256(a) <<= shift; } +inline const uint256 operator>>(const base_uint256& a, unsigned int shift) { return uint256(a) >>= shift; } +inline const uint256 operator<<(const uint256& a, unsigned int shift) { return uint256(a) <<= shift; } +inline const uint256 operator>>(const uint256& a, unsigned int shift) { return uint256(a) >>= shift; } + +inline const uint256 operator^(const base_uint256& a, const base_uint256& b) { return uint256(a) ^= b; } +inline const uint256 operator&(const base_uint256& a, const base_uint256& b) { return uint256(a) &= b; } +inline const uint256 operator|(const base_uint256& a, const base_uint256& b) { return uint256(a) |= b; } +inline const uint256 operator+(const base_uint256& a, const base_uint256& b) { return uint256(a) += b; } +inline const uint256 operator-(const base_uint256& a, const base_uint256& b) { return uint256(a) -= b; } + +inline bool operator<(const base_uint256& a, const uint256& b) { return (base_uint256)a < (base_uint256)b; } +inline bool operator<=(const base_uint256& a, const uint256& b) { return (base_uint256)a <= (base_uint256)b; } +inline bool operator>(const base_uint256& a, const uint256& b) { return (base_uint256)a > (base_uint256)b; } +inline bool operator>=(const base_uint256& a, const uint256& b) { return (base_uint256)a >= (base_uint256)b; } +inline bool operator==(const base_uint256& a, const uint256& b) { return (base_uint256)a == (base_uint256)b; } +inline bool operator!=(const base_uint256& a, const uint256& b) { return (base_uint256)a != (base_uint256)b; } +inline const uint256 operator^(const base_uint256& a, const uint256& b) { return (base_uint256)a ^ (base_uint256)b; } +inline const uint256 operator&(const base_uint256& a, const uint256& b) { return (base_uint256)a & (base_uint256)b; } +inline const uint256 operator|(const base_uint256& a, const uint256& b) { return (base_uint256)a | (base_uint256)b; } +inline const uint256 operator+(const base_uint256& a, const uint256& b) { return (base_uint256)a + (base_uint256)b; } +inline const uint256 operator-(const base_uint256& a, const uint256& b) { return (base_uint256)a - (base_uint256)b; } + +inline bool operator<(const uint256& a, const base_uint256& b) { return (base_uint256)a < (base_uint256)b; } +inline bool operator<=(const uint256& a, const base_uint256& b) { return (base_uint256)a <= (base_uint256)b; } +inline bool operator>(const uint256& a, const base_uint256& b) { return (base_uint256)a > (base_uint256)b; } +inline bool operator>=(const uint256& a, const base_uint256& b) { return (base_uint256)a >= (base_uint256)b; } +inline bool operator==(const uint256& a, const base_uint256& b) { return (base_uint256)a == (base_uint256)b; } +inline bool operator!=(const uint256& a, const base_uint256& b) { return (base_uint256)a != (base_uint256)b; } +inline const uint256 operator^(const uint256& a, const base_uint256& b) { return (base_uint256)a ^ (base_uint256)b; } +inline const uint256 operator&(const uint256& a, const base_uint256& b) { return (base_uint256)a & (base_uint256)b; } +inline const uint256 operator|(const uint256& a, const base_uint256& b) { return (base_uint256)a | (base_uint256)b; } +inline const uint256 operator+(const uint256& a, const base_uint256& b) { return (base_uint256)a + (base_uint256)b; } +inline const uint256 operator-(const uint256& a, const base_uint256& b) { return (base_uint256)a - (base_uint256)b; } + +inline bool operator<(const uint256& a, const uint256& b) { 
return (base_uint256)a < (base_uint256)b; } +inline bool operator<=(const uint256& a, const uint256& b) { return (base_uint256)a <= (base_uint256)b; } +inline bool operator>(const uint256& a, const uint256& b) { return (base_uint256)a > (base_uint256)b; } +inline bool operator>=(const uint256& a, const uint256& b) { return (base_uint256)a >= (base_uint256)b; } +inline bool operator==(const uint256& a, const uint256& b) { return (base_uint256)a == (base_uint256)b; } +inline bool operator!=(const uint256& a, const uint256& b) { return (base_uint256)a != (base_uint256)b; } +inline const uint256 operator^(const uint256& a, const uint256& b) { return (base_uint256)a ^ (base_uint256)b; } +inline const uint256 operator&(const uint256& a, const uint256& b) { return (base_uint256)a & (base_uint256)b; } +inline const uint256 operator|(const uint256& a, const uint256& b) { return (base_uint256)a | (base_uint256)b; } +inline const uint256 operator+(const uint256& a, const uint256& b) { return (base_uint256)a + (base_uint256)b; } +inline const uint256 operator-(const uint256& a, const uint256& b) { return (base_uint256)a - (base_uint256)b; } + + + + + + + + + + +#ifdef TEST_UINT256 + +inline int Testuint256AdHoc(std::vector vArg) +{ + uint256 g(0); + + + printf("%s\n", g.ToString().c_str()); + g--; printf("g--\n"); + printf("%s\n", g.ToString().c_str()); + g--; printf("g--\n"); + printf("%s\n", g.ToString().c_str()); + g++; printf("g++\n"); + printf("%s\n", g.ToString().c_str()); + g++; printf("g++\n"); + printf("%s\n", g.ToString().c_str()); + g++; printf("g++\n"); + printf("%s\n", g.ToString().c_str()); + g++; printf("g++\n"); + printf("%s\n", g.ToString().c_str()); + + + + uint256 a(7); + printf("a=7\n"); + printf("%s\n", a.ToString().c_str()); + + uint256 b; + printf("b undefined\n"); + printf("%s\n", b.ToString().c_str()); + int c = 3; + + a = c; + a.pn[3] = 15; + printf("%s\n", a.ToString().c_str()); + uint256 k(c); + + a = 5; + a.pn[3] = 15; + printf("%s\n", a.ToString().c_str()); + b = 1; + b <<= 52; + + a |= b; + + a ^= 0x500; + + printf("a %s\n", a.ToString().c_str()); + + a = a | b | (uint256)0x1000; + + + printf("a %s\n", a.ToString().c_str()); + printf("b %s\n", b.ToString().c_str()); + + a = 0xfffffffe; + a.pn[4] = 9; + + printf("%s\n", a.ToString().c_str()); + a++; + printf("%s\n", a.ToString().c_str()); + a++; + printf("%s\n", a.ToString().c_str()); + a++; + printf("%s\n", a.ToString().c_str()); + a++; + printf("%s\n", a.ToString().c_str()); + + a--; + printf("%s\n", a.ToString().c_str()); + a--; + printf("%s\n", a.ToString().c_str()); + a--; + printf("%s\n", a.ToString().c_str()); + uint256 d = a--; + printf("%s\n", d.ToString().c_str()); + printf("%s\n", a.ToString().c_str()); + a--; + printf("%s\n", a.ToString().c_str()); + a--; + printf("%s\n", a.ToString().c_str()); + + d = a; + + printf("%s\n", d.ToString().c_str()); + for (int i = uint256::WIDTH-1; i >= 0; i--) printf("%08x", d.pn[i]); printf("\n"); + + uint256 neg = d; + neg = ~neg; + printf("%s\n", neg.ToString().c_str()); + + + uint256 e = uint256("0xABCDEF123abcdef12345678909832180000011111111"); + printf("\n"); + printf("%s\n", e.ToString().c_str()); + + + printf("\n"); + uint256 x1 = uint256("0xABCDEF123abcdef12345678909832180000011111111"); + uint256 x2; + printf("%s\n", x1.ToString().c_str()); + for (int i = 0; i < 270; i += 4) + { + x2 = x1 << i; + printf("%s\n", x2.ToString().c_str()); + } + + printf("\n"); + printf("%s\n", x1.ToString().c_str()); + for (int i = 0; i < 270; i += 4) + { + x2 = x1; + x2 >>= i; + 
printf("%s\n", x2.ToString().c_str()); + } + + + for (int i = 0; i < 100; i++) + { + uint256 k = (~uint256(0) >> i); + printf("%s\n", k.ToString().c_str()); + } + + for (int i = 0; i < 100; i++) + { + uint256 k = (~uint256(0) << i); + printf("%s\n", k.ToString().c_str()); + } + + return (0); +} + +#endif + +#endif diff --git a/util.c b/util.c new file mode 100644 index 0000000..3f392bf --- /dev/null +++ b/util.c @@ -0,0 +1,1316 @@ +/* + * Copyright 2010 Jeff Garzik + * Copyright 2012-2014 pooler + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. See COPYING for more details. + */ + +#define _GNU_SOURCE +#include "cpuminer-config.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if defined(WIN32) +#include +#include +#else +#include +#include +#include +#include +#endif +#include "compat.h" +#include "miner.h" +#include "elist.h" + +struct data_buffer { + void *buf; + size_t len; +}; + +struct upload_buffer { + const void *buf; + size_t len; + size_t pos; +}; + +struct header_info { + char *lp_path; + char *reason; + char *stratum_url; +}; + +struct tq_ent { + void *data; + struct list_head q_node; +}; + +struct thread_q { + struct list_head q; + + bool frozen; + + pthread_mutex_t mutex; + pthread_cond_t cond; +}; + +void applog(int prio, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + +#ifdef HAVE_SYSLOG_H + if (use_syslog) { + va_list ap2; + char *buf; + int len; + + va_copy(ap2, ap); + len = vsnprintf(NULL, 0, fmt, ap2) + 1; + va_end(ap2); + buf = alloca(len); + if (vsnprintf(buf, len, fmt, ap) >= 0) + syslog(prio, "%s", buf); + } +#else + if (0) {} +#endif + else { + char *f; + int len; + time_t now; + struct tm tm, *tm_p; + + time(&now); + + pthread_mutex_lock(&applog_lock); + tm_p = localtime(&now); + memcpy(&tm, tm_p, sizeof(tm)); + pthread_mutex_unlock(&applog_lock); + + len = (int)(40 + strlen(fmt) + 2); + f = (char*)alloca(len); + sprintf(f, "[%d-%02d-%02d %02d:%02d:%02d] %s\n", + tm.tm_year + 1900, + tm.tm_mon + 1, + tm.tm_mday, + tm.tm_hour, + tm.tm_min, + tm.tm_sec, + fmt); + pthread_mutex_lock(&applog_lock); + vfprintf(stderr, f, ap); /* atomic write to stderr */ + fflush(stderr); + pthread_mutex_unlock(&applog_lock); + } + va_end(ap); +} + +static void databuf_free(struct data_buffer *db) +{ + if (!db) + return; + + free(db->buf); + + memset(db, 0, sizeof(*db)); +} + +static size_t all_data_cb(const void *ptr, size_t size, size_t nmemb, + void *user_data) +{ + struct data_buffer *db = (struct data_buffer *)user_data; + size_t len = size * nmemb; + size_t oldlen, newlen; + void *newmem; + static const unsigned char zero = 0; + + oldlen = db->len; + newlen = oldlen + len; + + newmem = realloc(db->buf, newlen + 1); + if (!newmem) + return 0; + + db->buf = newmem; + db->len = newlen; + memcpy((char*)db->buf + oldlen, ptr, len); + memcpy((char*)db->buf + newlen, &zero, 1); /* null terminate */ + + return len; +} + +static size_t upload_data_cb(void *ptr, size_t size, size_t nmemb, + void *user_data) +{ + struct upload_buffer *ub = (struct upload_buffer *)user_data; + unsigned int len = (unsigned int)(size * nmemb); + + if (len > ub->len - ub->pos) + len = (unsigned int)(ub->len - ub->pos); + + if (len) { + memcpy(ptr, (char*)ub->buf + ub->pos, len); + ub->pos += len; + } + + return len; +} + 
+#if LIBCURL_VERSION_NUM >= 0x071200 +static int seek_data_cb(void *user_data, curl_off_t offset, int origin) +{ + struct upload_buffer *ub = (struct upload_buffer *)user_data; + + switch (origin) { + case SEEK_SET: + ub->pos = (size_t)offset; + break; + case SEEK_CUR: + ub->pos += (size_t)offset; + break; + case SEEK_END: + ub->pos = ub->len + (size_t)offset; + break; + default: + return 1; /* CURL_SEEKFUNC_FAIL */ + } + + return 0; /* CURL_SEEKFUNC_OK */ +} +#endif + +static size_t resp_hdr_cb(void *ptr, size_t size, size_t nmemb, void *user_data) +{ + struct header_info *hi = (struct header_info *)user_data; + size_t remlen, slen, ptrlen = size * nmemb; + char *rem, *val = NULL, *key = NULL; + void *tmp; + + val = (char*)calloc(1, ptrlen); + key = (char*)calloc(1, ptrlen); + if (!key || !val) + goto out; + + tmp = memchr(ptr, ':', ptrlen); + if (!tmp || (tmp == ptr)) /* skip empty keys / blanks */ + goto out; + slen = (size_t)((char*)tmp - (char*)ptr); + if ((slen + 1) == ptrlen) /* skip key w/ no value */ + goto out; + memcpy(key, ptr, slen); /* store & nul term key */ + key[slen] = 0; + + rem = (char*)ptr + slen + 1; /* trim value's leading whitespace */ + remlen = ptrlen - slen - 1; + while ((remlen > 0) && (isspace(*rem))) { + remlen--; + rem++; + } + + memcpy(val, rem, remlen); /* store value, trim trailing ws */ + val[remlen] = 0; + while ((*val) && (isspace(val[strlen(val) - 1]))) { + val[strlen(val) - 1] = 0; + } + if (!*val) /* skip blank value */ + goto out; + + if (!strcasecmp("X-Long-Polling", key)) { + hi->lp_path = val; /* steal memory reference */ + val = NULL; + } + + if (!strcasecmp("X-Reject-Reason", key)) { + hi->reason = val; /* steal memory reference */ + val = NULL; + } + + if (!strcasecmp("X-Stratum", key)) { + hi->stratum_url = val; /* steal memory reference */ + val = NULL; + } + +out: + free(key); + free(val); + return ptrlen; +} + +#if LIBCURL_VERSION_NUM >= 0x070f06 +static int sockopt_keepalive_cb(void *userdata, curl_socket_t fd, + curlsocktype purpose) +{ + int keepalive = 1; + int tcp_keepcnt = 3; + int tcp_keepidle = 50; + int tcp_keepintvl = 50; +#ifdef WIN32 + DWORD outputBytes; +#endif + +#ifndef WIN32 + if (unlikely(setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &keepalive, + sizeof(keepalive)))) + return 1; +#ifdef __linux + if (unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPCNT, + &tcp_keepcnt, sizeof(tcp_keepcnt)))) + return 1; + if (unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPIDLE, + &tcp_keepidle, sizeof(tcp_keepidle)))) + return 1; + if (unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPINTVL, + &tcp_keepintvl, sizeof(tcp_keepintvl)))) + return 1; +#endif /* __linux */ +#ifdef __APPLE_CC__ + if (unlikely(setsockopt(fd, IPPROTO_TCP, TCP_KEEPALIVE, + &tcp_keepintvl, sizeof(tcp_keepintvl)))) + return 1; +#endif /* __APPLE_CC__ */ +#else /* WIN32 */ + struct tcp_keepalive vals; + vals.onoff = 1; + vals.keepalivetime = tcp_keepidle * 1000; + vals.keepaliveinterval = tcp_keepintvl * 1000; + if (unlikely(WSAIoctl(fd, SIO_KEEPALIVE_VALS, &vals, sizeof(vals), + NULL, 0, &outputBytes, NULL, NULL))) + return 1; +#endif /* WIN32 */ + + return 0; +} +#endif + +json_t *json_rpc_call(CURL *curl, const char *url, + const char *userpass, const char *rpc_req, + bool longpoll_scan, bool longpoll, int *curl_err) +{ + json_t *val, *err_val, *res_val; + int rc; + struct data_buffer all_data = {0}; + struct upload_buffer upload_data; + json_error_t err; + struct curl_slist *headers = NULL; + char len_hdr[64]; + char curl_err_str[CURL_ERROR_SIZE]; + long timeout = longpoll ? 
opt_timeout : 30; + struct header_info hi = {0}; + bool lp_scanning = longpoll_scan && !have_longpoll; + + /* it is assumed that 'curl' is freshly [re]initialized at this pt */ + + if (opt_protocol) + curl_easy_setopt(curl, CURLOPT_VERBOSE, 1); + curl_easy_setopt(curl, CURLOPT_URL, url); + if (opt_cert) + curl_easy_setopt(curl, CURLOPT_CAINFO, opt_cert); + curl_easy_setopt(curl, CURLOPT_ENCODING, ""); + curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1); + curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1); + curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, all_data_cb); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &all_data); + curl_easy_setopt(curl, CURLOPT_READFUNCTION, upload_data_cb); + curl_easy_setopt(curl, CURLOPT_READDATA, &upload_data); +#if LIBCURL_VERSION_NUM >= 0x071200 + curl_easy_setopt(curl, CURLOPT_SEEKFUNCTION, &seek_data_cb); + curl_easy_setopt(curl, CURLOPT_SEEKDATA, &upload_data); +#endif + curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_err_str); + curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1); + curl_easy_setopt(curl, CURLOPT_TIMEOUT, timeout); + curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, resp_hdr_cb); + curl_easy_setopt(curl, CURLOPT_HEADERDATA, &hi); + if (opt_proxy) { + curl_easy_setopt(curl, CURLOPT_PROXY, opt_proxy); + curl_easy_setopt(curl, CURLOPT_PROXYTYPE, opt_proxy_type); + } + if (userpass) { + curl_easy_setopt(curl, CURLOPT_USERPWD, userpass); + curl_easy_setopt(curl, CURLOPT_HTTPAUTH, CURLAUTH_BASIC); + } +#if LIBCURL_VERSION_NUM >= 0x070f06 + if (longpoll) + curl_easy_setopt(curl, CURLOPT_SOCKOPTFUNCTION, sockopt_keepalive_cb); +#endif + curl_easy_setopt(curl, CURLOPT_POST, 1); + + if (opt_protocol) + applog(LOG_DEBUG, "JSON protocol request:\n%s\n", rpc_req); + + upload_data.buf = rpc_req; + upload_data.len = strlen(rpc_req); + upload_data.pos = 0; + sprintf(len_hdr, "Content-Length: %lu", + (unsigned long) upload_data.len); + + headers = curl_slist_append(headers, "Content-Type: application/json"); + headers = curl_slist_append(headers, len_hdr); + headers = curl_slist_append(headers, "User-Agent: " USER_AGENT); + headers = curl_slist_append(headers, "X-Mining-Extensions: midstate"); + headers = curl_slist_append(headers, "Accept:"); /* disable Accept hdr*/ + headers = curl_slist_append(headers, "Expect:"); /* disable Expect hdr*/ + + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); + + rc = curl_easy_perform(curl); + if (curl_err != NULL) + *curl_err = rc; + if (rc) { + if (!(longpoll && rc == CURLE_OPERATION_TIMEDOUT)) + applog(LOG_ERR, "HTTP request failed: %s", curl_err_str); + goto err_out; + } + + /* If X-Stratum was found, activate Stratum */ + if (want_stratum && hi.stratum_url && + !strncasecmp(hi.stratum_url, "stratum+tcp://", 14) && + !(opt_proxy && opt_proxy_type == CURLPROXY_HTTP)) { + have_stratum = true; + tq_push(thr_info[stratum_thr_id].q, hi.stratum_url); + hi.stratum_url = NULL; + } + + /* If X-Long-Polling was found, activate long polling */ + if (lp_scanning && hi.lp_path && !have_stratum) { + have_longpoll = true; + tq_push(thr_info[longpoll_thr_id].q, hi.lp_path); + hi.lp_path = NULL; + } + + if (!all_data.buf) { + applog(LOG_ERR, "Empty data received in json_rpc_call."); + goto err_out; + } + + val = JSON_LOADS((const char*)all_data.buf, &err); + if (!val) { + applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text); + goto err_out; + } + + if (opt_protocol) { + char *s = json_dumps(val, JSON_INDENT(3)); + applog(LOG_DEBUG, "JSON protocol response:\n%s", s); + free(s); + 
} + + /* JSON-RPC valid response returns a non-null 'result', + * and a null 'error'. */ + res_val = json_object_get(val, "result"); + err_val = json_object_get(val, "error"); + + if (!res_val || json_is_null(res_val) || + (err_val && !json_is_null(err_val))) { + char *s; + + if (err_val) + s = json_dumps(err_val, JSON_INDENT(3)); + else + s = strdup("(unknown reason)"); + + applog(LOG_ERR, "JSON-RPC call failed: %s", s); + + free(s); + + goto err_out; + } + + if (hi.reason) + json_object_set_new(val, "reject-reason", json_string(hi.reason)); + + databuf_free(&all_data); + curl_slist_free_all(headers); + curl_easy_reset(curl); + return val; + +err_out: + free(hi.lp_path); + free(hi.reason); + free(hi.stratum_url); + databuf_free(&all_data); + curl_slist_free_all(headers); + curl_easy_reset(curl); + return NULL; +} + +char *bin2hex(const unsigned char *p, size_t len) +{ + unsigned int i; + char *s = (char*)malloc((len * 2) + 1); + if (!s) + return NULL; + + for (i = 0; i < len; i++) + sprintf(s + (i * 2), "%02x", (unsigned int) p[i]); + + return s; +} + +bool hex2bin(unsigned char *p, const char *hexstr, size_t len) +{ + char hex_byte[3]; + char *ep; + + hex_byte[2] = '\0'; + + while (*hexstr && len) { + if (!hexstr[1]) { + applog(LOG_ERR, "hex2bin str truncated"); + return false; + } + hex_byte[0] = hexstr[0]; + hex_byte[1] = hexstr[1]; + *p = (unsigned char) strtol(hex_byte, &ep, 16); + if (*ep) { + applog(LOG_ERR, "hex2bin failed on '%s'", hex_byte); + return false; + } + p++; + hexstr += 2; + len--; + } + + return (len == 0 && *hexstr == 0) ? true : false; +} + +/* Subtract the `struct timeval' values X and Y, + storing the result in RESULT. + Return 1 if the difference is negative, otherwise 0. */ +int timeval_subtract(struct timeval *result, struct timeval *x, + struct timeval *y) +{ + /* Perform the carry for the later subtraction by updating Y. */ + if (x->tv_usec < y->tv_usec) { + int nsec = (y->tv_usec - x->tv_usec) / 1000000 + 1; + y->tv_usec -= 1000000 * nsec; + y->tv_sec += nsec; + } + if (x->tv_usec - y->tv_usec > 1000000) { + int nsec = (x->tv_usec - y->tv_usec) / 1000000; + y->tv_usec += 1000000 * nsec; + y->tv_sec -= nsec; + } + + /* Compute the time remaining to wait. + * `tv_usec' is certainly positive. */ + result->tv_sec = x->tv_sec - y->tv_sec; + result->tv_usec = x->tv_usec - y->tv_usec; + + /* Return 1 if result is negative. */ + return x->tv_sec < y->tv_sec; +} + +bool fulltest(const uint32_t *hash, const uint32_t *target) +{ + int i; + bool rc = true; + + for (i = 7; i >= 0; i--) { + if (hash[i] > target[i]) { + rc = false; + break; + } + if (hash[i] < target[i]) { + rc = true; + break; + } + } + + if (opt_debug) { + uint32_t hash_be[8], target_be[8]; + char *hash_str, *target_str; + + for (i = 0; i < 8; i++) { + be32enc(hash_be + i, hash[7 - i]); + be32enc(target_be + i, target[7 - i]); + } + hash_str = bin2hex((unsigned char *)hash_be, 32); + target_str = bin2hex((unsigned char *)target_be, 32); + + applog(LOG_DEBUG, "DEBUG: %s\nHash: %s\nTarget: %s", + rc ? 
"hash <= target" + : "hash > target (false positive)", + hash_str, + target_str); + + free(hash_str); + free(target_str); + } + + return rc; +} + +void diff_to_target(uint32_t *target, double diff) +{ + uint64_t m; + int k; + + for (k = 6; k > 0 && diff > 1.0; k--) + diff /= 4294967296.0; + m = (uint64_t)(4294901760.0 / diff); + if (m == 0 && k == 6) + memset(target, 0xff, 32); + else { + memset(target, 0, 32); + target[k] = (uint32_t)m; + target[k + 1] = (uint32_t)(m >> 32); + } +} + +#ifdef WIN32 +#define socket_blocks() (WSAGetLastError() == WSAEWOULDBLOCK) +#else +#define socket_blocks() (errno == EAGAIN || errno == EWOULDBLOCK) +#endif + +static bool send_line(curl_socket_t sock, char *s) +{ + ssize_t len, sent = 0; + + len = (ssize_t)strlen(s); + s[len++] = '\n'; + + while (len > 0) { + struct timeval timeout = {0, 0}; + ssize_t n; + fd_set wd; + + FD_ZERO(&wd); + FD_SET(sock, &wd); + if (select((int)sock + 1, NULL, &wd, NULL, &timeout) < 1) + return false; + n = send(sock, s + sent, len, 0); + if (n < 0) { + if (!socket_blocks()) + return false; + n = 0; + } + sent += n; + len -= n; + } + + return true; +} + +bool stratum_send_line(struct stratum_ctx *sctx, char *s) +{ + bool ret = false; + + if (opt_protocol) + applog(LOG_DEBUG, "> %s", s); + + pthread_mutex_lock(&sctx->sock_lock); + ret = send_line(sctx->sock, s); + pthread_mutex_unlock(&sctx->sock_lock); + + return ret; +} + +static bool socket_full(curl_socket_t sock, int timeout) +{ + struct timeval tv; + fd_set rd; + + FD_ZERO(&rd); + FD_SET(sock, &rd); + tv.tv_sec = timeout; + tv.tv_usec = 0; + if (select((int)sock + 1, &rd, NULL, NULL, &tv) > 0) + return true; + return false; +} + +bool stratum_socket_full(struct stratum_ctx *sctx, int timeout) +{ + return strlen(sctx->sockbuf) || socket_full(sctx->sock, timeout); +} + +#define RBUFSIZE 2048 +#define RECVSIZE (RBUFSIZE - 4) + +static void stratum_buffer_append(struct stratum_ctx *sctx, const char *s) +{ + size_t old, snew; + + old = strlen(sctx->sockbuf); + snew = old + strlen(s) + 1; + if (snew >= sctx->sockbuf_size) { + sctx->sockbuf_size = snew + (RBUFSIZE - (snew % RBUFSIZE)); + sctx->sockbuf = (char*)realloc(sctx->sockbuf, sctx->sockbuf_size); + } + strcpy(sctx->sockbuf + old, s); +} + +char *stratum_recv_line(struct stratum_ctx *sctx) +{ + ssize_t len, buflen; + char *tok, *sret = NULL; + + if (!strstr(sctx->sockbuf, "\n")) { + bool ret = true; + time_t rstart; + + time(&rstart); + if (!socket_full(sctx->sock, 60)) { + applog(LOG_ERR, "stratum_recv_line timed out"); + goto out; + } + do { + char s[RBUFSIZE]; + ssize_t n; + + memset(s, 0, RBUFSIZE); + n = recv(sctx->sock, s, RECVSIZE, 0); + if (!n) { + ret = false; + break; + } + if (n < 0) { + if (!socket_blocks() || !socket_full(sctx->sock, 1)) { + ret = false; + break; + } + } else + stratum_buffer_append(sctx, s); + } while (time(NULL) - rstart < 60 && !strstr(sctx->sockbuf, "\n")); + + if (!ret) { + applog(LOG_ERR, "stratum_recv_line failed"); + goto out; + } + } + + buflen = (ssize_t)strlen(sctx->sockbuf); + tok = strtok(sctx->sockbuf, "\n"); + if (!tok) { + applog(LOG_ERR, "stratum_recv_line failed to parse a newline-terminated string"); + goto out; + } + sret = strdup(tok); + len = (ssize_t)strlen(sret); + + if (buflen > len + 1) + memmove(sctx->sockbuf, sctx->sockbuf + len + 1, buflen - len + 1); + else + sctx->sockbuf[0] = '\0'; + +out: + if (sret && opt_protocol) + applog(LOG_DEBUG, "< %s", sret); + return sret; +} + +#if LIBCURL_VERSION_NUM >= 0x071101 +static curl_socket_t opensocket_grab_cb(void *clientp, 
curlsocktype purpose, + struct curl_sockaddr *addr) +{ + curl_socket_t *sock = (curl_socket_t *)clientp; + *sock = socket(addr->family, addr->socktype, addr->protocol); + return *sock; +} +#endif + +bool stratum_connect(struct stratum_ctx *sctx, const char *url) +{ + CURL *curl; + int rc; + + pthread_mutex_lock(&sctx->sock_lock); + if (sctx->curl) + curl_easy_cleanup(sctx->curl); + sctx->curl = curl_easy_init(); + if (!sctx->curl) { + applog(LOG_ERR, "CURL initialization failed"); + pthread_mutex_unlock(&sctx->sock_lock); + return false; + } + curl = sctx->curl; + if (!sctx->sockbuf) { + sctx->sockbuf = (char*)calloc(RBUFSIZE, 1); + sctx->sockbuf_size = RBUFSIZE; + } + sctx->sockbuf[0] = '\0'; + pthread_mutex_unlock(&sctx->sock_lock); + + if (url != sctx->url) { + free(sctx->url); + sctx->url = strdup(url); + } + free(sctx->curl_url); + sctx->curl_url = (char*)malloc(strlen(url)); + sprintf(sctx->curl_url, "http%s", strstr(url, "://")); + + if (opt_protocol) + curl_easy_setopt(curl, CURLOPT_VERBOSE, 1); + curl_easy_setopt(curl, CURLOPT_URL, sctx->curl_url); + curl_easy_setopt(curl, CURLOPT_FRESH_CONNECT, 1); + curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 30); + curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, sctx->curl_err_str); + curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1); + curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1); + if (opt_proxy && opt_proxy_type != CURLPROXY_HTTP) { + curl_easy_setopt(curl, CURLOPT_PROXY, opt_proxy); + curl_easy_setopt(curl, CURLOPT_PROXYTYPE, opt_proxy_type); + } else if (getenv("http_proxy")) { + if (getenv("all_proxy")) + curl_easy_setopt(curl, CURLOPT_PROXY, getenv("all_proxy")); + else if (getenv("ALL_PROXY")) + curl_easy_setopt(curl, CURLOPT_PROXY, getenv("ALL_PROXY")); + else + curl_easy_setopt(curl, CURLOPT_PROXY, ""); + } +#if LIBCURL_VERSION_NUM >= 0x070f06 + curl_easy_setopt(curl, CURLOPT_SOCKOPTFUNCTION, sockopt_keepalive_cb); +#endif +#if LIBCURL_VERSION_NUM >= 0x071101 + curl_easy_setopt(curl, CURLOPT_OPENSOCKETFUNCTION, opensocket_grab_cb); + curl_easy_setopt(curl, CURLOPT_OPENSOCKETDATA, &sctx->sock); +#endif + curl_easy_setopt(curl, CURLOPT_CONNECT_ONLY, 1); + + rc = curl_easy_perform(curl); + if (rc) { + applog(LOG_ERR, "Stratum connection failed: %s", sctx->curl_err_str); + curl_easy_cleanup(curl); + sctx->curl = NULL; + return false; + } + +#if LIBCURL_VERSION_NUM < 0x071101 + /* CURLINFO_LASTSOCKET is broken on Win64; only use it as a last resort */ + curl_easy_getinfo(curl, CURLINFO_LASTSOCKET, (long *)&sctx->sock); +#endif + + return true; +} + +void stratum_disconnect(struct stratum_ctx *sctx) +{ + pthread_mutex_lock(&sctx->sock_lock); + if (sctx->curl) { + curl_easy_cleanup(sctx->curl); + sctx->curl = NULL; + sctx->sockbuf[0] = '\0'; + } + pthread_mutex_unlock(&sctx->sock_lock); +} + +static const char *get_stratum_session_id(json_t *val) +{ + json_t *arr_val; + int i, n; + + arr_val = json_array_get(val, 0); + if (!arr_val || !json_is_array(arr_val)) + return NULL; + n = json_array_size(arr_val); + for (i = 0; i < n; i++) { + const char *notify; + json_t *arr = json_array_get(arr_val, i); + + if (!arr || !json_is_array(arr)) + break; + notify = json_string_value(json_array_get(arr, 0)); + if (!notify) + continue; + if (!strcasecmp(notify, "mining.notify")) + return json_string_value(json_array_get(arr, 1)); + } + return NULL; +} + +bool stratum_subscribe(struct stratum_ctx *sctx) +{ + char *s, *sret = NULL; + const char *sid, *xnonce1; + int xn2_size; + json_t *val = NULL, *res_val, *err_val; + json_error_t err; + bool ret = false, retry 
+
+start:
+	s = (char*)malloc(128 + (sctx->session_id ? strlen(sctx->session_id) : 0));
+	if (retry)
+		sprintf(s, "{\"id\": 1, \"method\": \"mining.subscribe\", \"params\": []}");
+	else if (sctx->session_id)
+		sprintf(s, "{\"id\": 1, \"method\": \"mining.subscribe\", \"params\": [\"" USER_AGENT "\", \"%s\"]}", sctx->session_id);
+	else
+		sprintf(s, "{\"id\": 1, \"method\": \"mining.subscribe\", \"params\": [\"" USER_AGENT "\"]}");
+
+	if (!stratum_send_line(sctx, s))
+		goto out;
+
+	if (!socket_full(sctx->sock, 30)) {
+		applog(LOG_ERR, "stratum_subscribe timed out");
+		goto out;
+	}
+
+	sret = stratum_recv_line(sctx);
+	if (!sret)
+		goto out;
+
+	val = JSON_LOADS(sret, &err);
+	free(sret);
+	if (!val) {
+		applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text);
+		goto out;
+	}
+
+	res_val = json_object_get(val, "result");
+	err_val = json_object_get(val, "error");
+
+	if (!res_val || json_is_null(res_val) ||
+	    (err_val && !json_is_null(err_val))) {
+		if (opt_debug || retry) {
+			free(s);
+			if (err_val)
+				s = json_dumps(err_val, JSON_INDENT(3));
+			else
+				s = strdup("(unknown reason)");
+			applog(LOG_ERR, "JSON-RPC call failed: %s", s);
+		}
+		goto out;
+	}
+
+	sid = get_stratum_session_id(res_val);
+	if (opt_debug && !sid)
+		applog(LOG_DEBUG, "Failed to get Stratum session id");
+	xnonce1 = json_string_value(json_array_get(res_val, 1));
+	if (!xnonce1) {
+		applog(LOG_ERR, "Failed to get extranonce1");
+		goto out;
+	}
+	xn2_size = json_integer_value(json_array_get(res_val, 2));
+	if (!xn2_size) {
+		applog(LOG_ERR, "Failed to get extranonce2_size");
+		goto out;
+	}
+
+	pthread_mutex_lock(&sctx->work_lock);
+	free(sctx->session_id);
+	free(sctx->xnonce1);
+	sctx->session_id = sid ? strdup(sid) : NULL;
+	sctx->xnonce1_size = strlen(xnonce1) / 2;
+	sctx->xnonce1 = (unsigned char*)malloc(sctx->xnonce1_size);
+	hex2bin(sctx->xnonce1, xnonce1, sctx->xnonce1_size);
+	sctx->xnonce2_size = xn2_size;
+	sctx->next_diff = 1.0;
+	pthread_mutex_unlock(&sctx->work_lock);
+
+	if (opt_debug && sid)
+		applog(LOG_DEBUG, "Stratum session id: %s", sctx->session_id);
+
+	ret = true;
+
+out:
+	free(s);
+	if (val)
+		json_decref(val);
+
+	if (!ret) {
+		if (sret && !retry) {
+			retry = true;
+			goto start;
+		}
+	}
+
+	return ret;
+}
+
+bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass)
+{
+	json_t *val = NULL, *res_val, *err_val;
+	char *s, *sret;
+	json_error_t err;
+	bool ret = false;
+
+	s = (char*)malloc(80 + strlen(user) + strlen(pass));
+	sprintf(s, "{\"id\": 2, \"method\": \"mining.authorize\", \"params\": [\"%s\", \"%s\"]}",
+			user, pass);
+
+	if (!stratum_send_line(sctx, s))
+		goto out;
+
+	while (1) {
+		sret = stratum_recv_line(sctx);
+		if (!sret)
+			goto out;
+		if (!stratum_handle_method(sctx, sret))
+			break;
+		free(sret);
+	}
+
+	val = JSON_LOADS(sret, &err);
+	free(sret);
+	if (!val) {
+		applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text);
+		goto out;
+	}
+
+	res_val = json_object_get(val, "result");
+	err_val = json_object_get(val, "error");
+
+	if (!res_val || json_is_false(res_val) ||
+	    (err_val && !json_is_null(err_val))) {
+		applog(LOG_ERR, "Stratum authentication failed");
+		goto out;
+	}
+
+	ret = true;
+
+out:
+	free(s);
+	if (val)
+		json_decref(val);
+
+	return ret;
+}
+
+static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
+{
+	const char *job_id, *prevhash, *coinb1, *coinb2, *version, *nbits, *ntime, *nreward;
+	size_t coinb1_size, coinb2_size;
+	bool clean, ret = false;
+	int merkle_count, i;
+	json_t *merkle_arr;
+	unsigned char **merkle;
+
+	job_id = json_string_value(json_array_get(params, 0));
+	prevhash = json_string_value(json_array_get(params, 1));
+	coinb1 = json_string_value(json_array_get(params, 2));
+	coinb2 = json_string_value(json_array_get(params, 3));
+	merkle_arr = json_array_get(params, 4);
+	if (!merkle_arr || !json_is_array(merkle_arr))
+		goto out;
+	merkle_count = json_array_size(merkle_arr);
+	version = json_string_value(json_array_get(params, 5));
+	nbits = json_string_value(json_array_get(params, 6));
+	ntime = json_string_value(json_array_get(params, 7));
+	clean = json_is_true(json_array_get(params, 8));
+	nreward = json_string_value(json_array_get(params, 9));
+
+	/* nreward (params[9]) is optional; avoid strlen(NULL) when a pool omits it */
+	if (!job_id || !prevhash || !coinb1 || !coinb2 || !version || !nbits || !ntime ||
+	    strlen(prevhash) != 64 || strlen(version) != 8 ||
+	    strlen(nbits) != 8 || strlen(ntime) != 8 || (nreward && strlen(nreward) != 4)) {
+		applog(LOG_ERR, "Stratum notify: invalid parameters");
+		goto out;
+	}
+	merkle = (unsigned char**)malloc(merkle_count * sizeof(char *));
+	for (i = 0; i < merkle_count; i++) {
+		const char *s = json_string_value(json_array_get(merkle_arr, i));
+		if (!s || strlen(s) != 64) {
+			while (i--)
+				free(merkle[i]);
+			free(merkle);
+			applog(LOG_ERR, "Stratum notify: invalid Merkle branch");
+			goto out;
+		}
+		merkle[i] = (unsigned char*)malloc(32);
+		hex2bin(merkle[i], s, 32);
+	}
+
+	pthread_mutex_lock(&sctx->work_lock);
+
+	coinb1_size = strlen(coinb1) / 2;
+	coinb2_size = strlen(coinb2) / 2;
+	sctx->job.coinbase_size = coinb1_size + sctx->xnonce1_size +
+			sctx->xnonce2_size + coinb2_size;
+	sctx->job.coinbase = (unsigned char*)realloc(sctx->job.coinbase, sctx->job.coinbase_size);
+	sctx->job.xnonce2 = sctx->job.coinbase + coinb1_size + sctx->xnonce1_size;
+	hex2bin(sctx->job.coinbase, coinb1, coinb1_size);
+	memcpy(sctx->job.coinbase + coinb1_size, sctx->xnonce1, sctx->xnonce1_size);
+	if (!sctx->job.job_id || strcmp(sctx->job.job_id, job_id))
+		memset(sctx->job.xnonce2, 0, sctx->xnonce2_size);
+	hex2bin(sctx->job.xnonce2 + sctx->xnonce2_size, coinb2, coinb2_size);
+
+	free(sctx->job.job_id);
+	sctx->job.job_id = strdup(job_id);
+	hex2bin(sctx->job.prevhash, prevhash, 32);
+
+	for (i = 0; i < sctx->job.merkle_count; i++)
+		free(sctx->job.merkle[i]);
+	free(sctx->job.merkle);
+	sctx->job.merkle = merkle;
+	sctx->job.merkle_count = merkle_count;
+
+	hex2bin(sctx->job.version, version, 4);
+	hex2bin(sctx->job.nbits, nbits, 4);
+	hex2bin(sctx->job.ntime, ntime, 4);
+	if (nreward)
+		hex2bin(sctx->job.nreward, nreward, 2);
+	sctx->job.clean = clean;
+
+	sctx->job.diff = sctx->next_diff;
+
+	pthread_mutex_unlock(&sctx->work_lock);
+
+	ret = true;
+
+out:
+	return ret;
+}
+
+static bool stratum_set_difficulty(struct stratum_ctx *sctx, json_t *params)
+{
+	double diff;
+
+	diff = json_number_value(json_array_get(params, 0));
+	if (diff == 0)
+		return false;
+
+	pthread_mutex_lock(&sctx->work_lock);
+	sctx->next_diff = diff;
+	pthread_mutex_unlock(&sctx->work_lock);
+
+	if (opt_debug)
+		applog(LOG_DEBUG, "Stratum difficulty set to %g", diff);
+
+	return true;
+}
+
+static bool stratum_reconnect(struct stratum_ctx *sctx, json_t *params)
+{
+	json_t *port_val;
+	const char *host;
+	int port;
+
+	host = json_string_value(json_array_get(params, 0));
+	port_val = json_array_get(params, 1);
+	if (json_is_string(port_val))
+		port = atoi(json_string_value(port_val));
+	else
+		port = json_integer_value(port_val);
+	if (!host || !port)
+		return false;
+
+	free(sctx->url);
+	sctx->url = (char*)malloc(32 + strlen(host));
+	sprintf(sctx->url, "stratum+tcp://%s:%d", host, port);
"stratum+tcp://%s:%d", host, port); + + applog(LOG_NOTICE, "Server requested reconnection to %s", sctx->url); + + stratum_disconnect(sctx); + + return true; +} + +static bool stratum_get_version(struct stratum_ctx *sctx, json_t *id) +{ + char *s; + json_t *val; + bool ret; + + if (!id || json_is_null(id)) + return false; + + val = json_object(); + json_object_set(val, "id", id); + json_object_set_new(val, "error", json_null()); + json_object_set_new(val, "result", json_string(USER_AGENT)); + s = json_dumps(val, 0); + ret = stratum_send_line(sctx, s); + json_decref(val); + free(s); + + return ret; +} + +static bool stratum_show_message(struct stratum_ctx *sctx, json_t *id, json_t *params) +{ + char *s; + json_t *val; + bool ret; + + val = json_array_get(params, 0); + if (val) + applog(LOG_NOTICE, "MESSAGE FROM SERVER: %s", json_string_value(val)); + + if (!id || json_is_null(id)) + return true; + + val = json_object(); + json_object_set(val, "id", id); + json_object_set_new(val, "error", json_null()); + json_object_set_new(val, "result", json_true()); + s = json_dumps(val, 0); + ret = stratum_send_line(sctx, s); + json_decref(val); + free(s); + + return ret; +} + +bool stratum_handle_method(struct stratum_ctx *sctx, const char *s) +{ + json_t *val, *id, *params; + json_error_t err; + const char *method; + bool ret = false; + + val = JSON_LOADS(s, &err); + if (!val) { + applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text); + goto out; + } + + method = json_string_value(json_object_get(val, "method")); + if (!method) + goto out; + id = json_object_get(val, "id"); + params = json_object_get(val, "params"); + + if (!strcasecmp(method, "mining.notify")) { + ret = stratum_notify(sctx, params); + goto out; + } + if (!strcasecmp(method, "mining.set_difficulty")) { + ret = stratum_set_difficulty(sctx, params); + goto out; + } + if (!strcasecmp(method, "client.reconnect")) { + ret = stratum_reconnect(sctx, params); + goto out; + } + if (!strcasecmp(method, "client.get_version")) { + ret = stratum_get_version(sctx, id); + goto out; + } + if (!strcasecmp(method, "client.show_message")) { + ret = stratum_show_message(sctx, id, params); + goto out; + } + +out: + if (val) + json_decref(val); + + return ret; +} + +struct thread_q *tq_new(void) +{ + struct thread_q *tq; + + tq = (struct thread_q *)calloc(1, sizeof(*tq)); + if (!tq) + return NULL; + + INIT_LIST_HEAD(&tq->q); + pthread_mutex_init(&tq->mutex, NULL); + pthread_cond_init(&tq->cond, NULL); + + return tq; +} + +void tq_free(struct thread_q *tq) +{ + struct tq_ent *ent, *iter; + + if (!tq) + return; + + list_for_each_entry_safe(ent, iter, &tq->q, q_node, struct tq_ent, struct tq_ent) { + list_del(&ent->q_node); + free(ent); + } + + pthread_cond_destroy(&tq->cond); + pthread_mutex_destroy(&tq->mutex); + + memset(tq, 0, sizeof(*tq)); /* poison */ + free(tq); +} + +static void tq_freezethaw(struct thread_q *tq, bool frozen) +{ + pthread_mutex_lock(&tq->mutex); + + tq->frozen = frozen; + + pthread_cond_signal(&tq->cond); + pthread_mutex_unlock(&tq->mutex); +} + +void tq_freeze(struct thread_q *tq) +{ + tq_freezethaw(tq, true); +} + +void tq_thaw(struct thread_q *tq) +{ + tq_freezethaw(tq, false); +} + +bool tq_push(struct thread_q *tq, void *data) +{ + struct tq_ent *ent; + bool rc = true; + + ent = (struct tq_ent *)calloc(1, sizeof(*ent)); + if (!ent) + return false; + + ent->data = data; + INIT_LIST_HEAD(&ent->q_node); + + pthread_mutex_lock(&tq->mutex); + + if (!tq->frozen) { + list_add_tail(&ent->q_node, &tq->q); + } else { + 
+		free(ent);
+		rc = false;
+	}
+
+	pthread_cond_signal(&tq->cond);
+	pthread_mutex_unlock(&tq->mutex);
+
+	return rc;
+}
+
+void *tq_pop(struct thread_q *tq, const struct timespec *abstime)
+{
+	struct tq_ent *ent;
+	void *rval = NULL;
+	int rc;
+
+	pthread_mutex_lock(&tq->mutex);
+
+	if (!list_empty(&tq->q))
+		goto pop;
+
+	if (abstime)
+		rc = pthread_cond_timedwait(&tq->cond, &tq->mutex, abstime);
+	else
+		rc = pthread_cond_wait(&tq->cond, &tq->mutex);
+	if (rc)
+		goto out;
+	if (list_empty(&tq->q))
+		goto out;
+
+pop:
+	ent = list_entry(tq->q.next, struct tq_ent, q_node);
+	rval = ent->data;
+
+	list_del(&ent->q_node);
+	free(ent);
+
+out:
+	pthread_mutex_unlock(&tq->mutex);
+	return rval;
+}
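For reference, a minimal producer/consumer sketch of how the thread_q above is typically driven; it is illustrative only and not part of the patch. It assumes the tq_* prototypes and struct thread_q are visible through miner.h, as in other cpuminer-derived trees; adjust the include if they live elsewhere.

/* Hypothetical usage sketch of the tq_* queue: one producer thread hands
 * heap-allocated work items to a consumer.  Ownership of each pointer
 * passes to whoever pops it (or back to the producer if the push fails). */
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include "miner.h"	/* assumed home of struct thread_q and the tq_* prototypes */

static void *producer(void *arg)
{
	struct thread_q *tq = (struct thread_q *)arg;
	int i;

	for (i = 0; i < 4; i++) {
		char *msg = (char *)malloc(32);
		sprintf(msg, "work item %d", i);
		if (!tq_push(tq, msg))	/* push is refused while the queue is frozen */
			free(msg);
	}
	return NULL;
}

int main(void)
{
	struct thread_q *tq = tq_new();
	pthread_t thr;
	int received = 0;

	pthread_create(&thr, NULL, producer, tq);
	while (received < 4) {
		/* NULL abstime blocks on the condvar until tq_push signals it */
		char *msg = (char *)tq_pop(tq, NULL);
		if (!msg)	/* tq_pop can return NULL after a spurious wakeup */
			continue;
		printf("got: %s\n", msg);
		free(msg);
		received++;
	}
	pthread_join(thr, NULL);
	tq_free(tq);
	return 0;
}

This mirrors how the miner uses the queue elsewhere: the stratum thread pushes freshly decoded work and the worker side blocks in tq_pop until something arrives, with tq_freeze/tq_thaw used around reconnects to reject pushes while the connection is down.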