/***
* ==++==
*
* Copyright (c) Microsoft Corporation. All rights reserved.
*
* ==--==
* =+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
*
* ppl.h
*
* Parallel Patterns Library
*
* =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
****/

#pragma once

#include <crtdefs.h>
#include <concrt.h>
#include <stdexcept>
#include <iterator>
#include <functional>
#include <memory>
#include <type_traits>
#include <algorithm>

#include <pplconcrt.h>

#define _PPL_H

#pragma pack(push,_CRT_PACKING)
#pragma push_macro("new")
#undef new

// Define the level of tracing to use

#define _TRACE_LEVEL_INFORMATION 4

/// <summary>
///     The <c>Concurrency</c> namespace provides classes and functions that give you access to the Concurrency Runtime,
///     a concurrent programming framework for C++. For more information, see <see cref="Concurrency Runtime"/>.
/// </summary>
/**/
namespace Concurrency
{
namespace details
{
    _CRTIMP2 size_t __cdecl _GetCombinableSize();
} // namespace details

class structured_task_group;
class task_group;

/// <summary>
///     The <c>task_handle</c> class represents an individual parallel work item. It encapsulates the instructions and the data required
///     to execute a piece of work.
/// </summary>
/// <typeparam name="_Function">
///     The type of the function object that will be invoked to execute the work represented by the <c>task_handle</c> object.
/// </typeparam>
/// <remarks>
///     <c>task_handle</c> objects can be used in conjunction with a <c>structured_task_group</c> or a more general <c>task_group</c> object,
///     to decompose work into parallel tasks. For more information, see <see cref="Task Parallelism"/>.
///     <para>Note that the creator of a <c>task_handle</c> object is responsible for maintaining the lifetime of the created
///     <c>task_handle</c> object until it is no longer required by the Concurrency Runtime. Typically, this means that the <c>task_handle</c>
///     object must not destruct until either the <c>wait</c> or <c>run_and_wait</c> method of the <c>task_group</c> or
///     <c>structured_task_group</c> to which it is queued has been called.</para>
///     <para><c>task_handle</c> objects are typically used in conjunction with C++ lambdas. Because you do not know the true type of the lambda,
///     the <see cref="make_task Function">make_task</see> function is typically used to create a <c>task_handle</c> object.</para>
///     <para>The runtime creates a copy of the work function that you pass to a <c>task_handle</c> object. Therefore, any state changes that occur in a
///     function object that you pass to a <c>task_handle</c> object will not appear in your copy of that function object.</para>
/// </remarks>
/// <seealso cref="task_group Class"/>
/// <seealso cref="structured_task_group Class"/>
/// <seealso cref="make_task Function"/>
/// <seealso cref="task_group::run Method"/>
/// <seealso cref="task_group::wait Method"/>
/// <seealso cref="task_group::run_and_wait Method"/>
/// <seealso cref="structured_task_group::run Method"/>
/// <seealso cref="structured_task_group::wait Method"/>
/// <seealso cref="structured_task_group::run_and_wait Method"/>
/**/
template<typename _Function>
class task_handle : public ::Concurrency::details::_UnrealizedChore
{
public:
    /// <summary>
    ///     Constructs a new <c>task_handle</c> object. The work of the task is performed by invoking the function specified as
    ///     a parameter to the constructor.
    /// </summary>
    /// <param name="_Func">
    ///     The function that will be invoked to execute the work represented by the <c>task_handle</c> object. This may be a lambda functor,
    ///     a pointer to a function, or any object that supports a version of the function call operator with the signature <c>void operator()()</c>.
    /// </param>
    /// <remarks>
    ///     The runtime creates a copy of the work function that you pass to the constructor. Therefore, any state changes that occur in a function
    ///     object that you pass to a <c>task_handle</c> object will not appear in your copy of that function object.
    /// </remarks>
    /**/
    task_handle(const _Function& _Func) : _M_function(_Func)
    {
        m_pFunction = reinterpret_cast<TaskProc>(&::Concurrency::details::_UnrealizedChore::_InvokeBridge<task_handle>);
    }

    /// <summary>
    ///     Destroys the <c>task_handle</c> object.
    /// </summary>
    /**/
    ~task_handle()
    {
        //
        // We only need to perform a liveness check if the client owns the lifetime of the handle. Doing this for runtime owned handles
        // is not only unnecessary -- it is also dangerous.
        //
        if (_OwningCollection() != NULL && !_GetRuntimeOwnsLifetime())
        {
            _CheckTaskCollection();
        }
    }

    /// <summary>
    ///     The function call operator that the runtime invokes to perform the work of the task handle.
    /// </summary>
    /**/
    void operator()() const
    {
        _M_function();
    }

private:

    friend class task_group;
    friend class structured_task_group;

    // The function object invoked to perform the body of the task.
    _Function _M_function;

    task_handle const & operator=(task_handle const&);    // no assignment operator

};

/// <summary>
///     A factory method for creating a <c>task_handle</c> object.
/// </summary>
/// <typeparam name="_Function">
///     The type of the function object that will be invoked to execute the work represented by the <c>task_handle</c> object.
/// </typeparam>
/// <param name="_Func">
///     The function that will be invoked to execute the work represented by the <c>task_handle</c> object. This may be a lambda functor,
///     a pointer to a function, or any object that supports a version of the function call operator with the signature <c>void operator()()</c>.
/// </param>
/// <returns>
///     A <c>task_handle</c> object.
/// </returns>
/// <remarks>
///     This function is useful when you need to create a <c>task_handle</c> object with a lambda expression, because it allows you to
///     create the object without knowing the true type of the lambda functor.
/// </remarks>
/// <seealso cref="task_handle Class"/>
/// <seealso cref="task_group Class"/>
/// <seealso cref="structured_task_group Class"/>
/**/
template <class _Function>
task_handle<_Function> make_task(const _Function& _Func)
{
    return task_handle<_Function>(_Func);
}
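
// An illustrative sketch (not part of the library): because a lambda's type cannot be named,
// make_task is the typical way to obtain a task_handle for one. The caller owns both handles
// below, so they must outlive the wait; do_work_a and do_work_b are hypothetical functions
// standing in for user code.
//
//     structured_task_group tg;
//     auto h1 = make_task([] { do_work_a(); });
//     auto h2 = make_task([] { do_work_b(); });
//     tg.run(h1);              // h1 may execute on another context
//     tg.run_and_wait(h2);     // h2 runs inline; waits for the whole group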

/// <summary>
///     The <c>structured_task_group</c> class represents a highly structured collection of parallel work. You can queue individual parallel tasks to
///     a <c>structured_task_group</c> using <c>task_handle</c> objects, and wait for them to complete, or cancel the task group before they have finished
///     executing, which will abort any tasks that have not begun execution.
/// </summary>
/// <remarks>
///     There are a number of severe restrictions placed on usage of a <c>structured_task_group</c> object in order to gain performance:
///     <list type="bullet">
///     <item>
///     <description>A single <c>structured_task_group</c> object cannot be used by multiple threads. All operations on a <c>structured_task_group</c> object
///     must be performed by the thread that created the object. The two exceptions to this rule are the member functions <c>cancel</c> and
///     <c>is_canceling</c>. The object may not be in the capture list of a lambda expression and be used within a task, unless the task is using one
///     of the cancellation operations.</description>
///     </item>
///     <item>
///     <description>All tasks scheduled to a <c>structured_task_group</c> object are scheduled through the use of <c>task_handle</c> objects which
///     you must explicitly manage the lifetime of.</description>
///     </item>
///     <item>
///     <description>Multiple groups may only be used in absolutely nested order. If two <c>structured_task_group</c> objects are declared, the second
///     one being declared (the inner one) must destruct before any method except <c>cancel</c> or <c>is_canceling</c> is called on the first one
///     (the outer one). This condition holds true in both the case of simply declaring multiple <c>structured_task_group</c> objects within the same
///     or functionally nested scopes as well as the case of a task that was queued to the <c>structured_task_group</c> via the <c>run</c> or
///     <c>run_and_wait</c> methods.</description>
///     </item>
///     <item>
///     <description>Unlike the general <c>task_group</c> class, all states in the <c>structured_task_group</c> class are final. After you have queued tasks to the
///     group and waited for them to complete, you may not use the same group again.</description>
///     </item>
///     </list>
///     <para>For more information, see <see cref="Task Parallelism"/>.</para>
/// </remarks>
/// <seealso cref="task_group Class"/>
/// <seealso cref="task_handle Class"/>
/**/
class structured_task_group
{
public:

    /// <summary>
    ///     Constructs a new <c>structured_task_group</c> object.
    /// </summary>
    /// <remarks>
    ///     The constructor that takes a cancellation token creates a <c>structured_task_group</c> that will be canceled when the source associated with
    ///     the token is canceled. Providing an explicit cancellation token also isolates this structured task group from participating in an implicit
    ///     cancellation from a parent group with a different token or no token.
    /// </remarks>
    /// <seealso cref="Task Parallelism"/>
    /**/
    structured_task_group()
    {
    }

    /// <summary>
    ///     Constructs a new <c>structured_task_group</c> object.
    /// </summary>
    /// <param name="_CancellationToken">
    ///     A cancellation token to associate with this structured task group. The structured task group will be canceled when the token is canceled.
    /// </param>
    /// <remarks>
    ///     The constructor that takes a cancellation token creates a <c>structured_task_group</c> that will be canceled when the source associated with
    ///     the token is canceled. Providing an explicit cancellation token also isolates this structured task group from participating in an implicit
    ///     cancellation from a parent group with a different token or no token.
    /// </remarks>
    /// <seealso cref="Task Parallelism"/>
    /**/
    structured_task_group(cancellation_token _CancellationToken) :
        _M_task_collection(_CancellationToken._GetImpl() != NULL ? _CancellationToken._GetImpl() : Concurrency::details::_CancellationTokenState::_None())
    {
    }

    /// <summary>
    ///     Destroys a <c>structured_task_group</c> object. You are expected to call either the <c>wait</c> or <c>run_and_wait</c> method on the
    ///     object prior to the destructor executing, unless the destructor is executing as a result of stack unwinding due to an exception.
    /// </summary>
    /// <remarks>
    ///     If the destructor runs as the result of normal execution (for example, not stack unwinding due to an exception) and neither the <c>wait</c> nor
    ///     <c>run_and_wait</c> methods have been called, the destructor may throw a <see cref="missing_wait Class">missing_wait</see> exception.
    /// </remarks>
    /// <seealso cref="structured_task_group::wait Method"/>
    /// <seealso cref="structured_task_group::run_and_wait Method"/>
    /**/
    ~structured_task_group()
    {
    }

    /// <summary>
    ///     Schedules a task on the <c>structured_task_group</c> object. The caller manages the lifetime of the <c>task_handle</c> object passed
    ///     in the <paramref name="_Task_handle"/> parameter. The version that takes the parameter <paramref name="_Placement"/> causes the task
    ///     to be biased towards executing at the location specified by that parameter.
    /// </summary>
    /// <typeparam name="_Function">
    ///     The type of the function object that will be invoked to execute the body of the task handle.
    /// </typeparam>
    /// <param name="_Task_handle">
    ///     A handle to the work being scheduled. Note that the caller has responsibility for the lifetime of this object. The runtime will
    ///     continue to expect it to live until either the <c>wait</c> or <c>run_and_wait</c> method has been called on this
    ///     <c>structured_task_group</c> object.
    /// </param>
    /// <remarks>
    ///     The runtime creates a copy of the work function that you pass to this method. Any state changes that occur in a function object that you
    ///     pass to this method will not appear in your copy of that function object.
    ///     <para>If the <c>structured_task_group</c> destructs as the result of stack unwinding from an exception, you do not need to guarantee
    ///     that a call has been made to either the <c>wait</c> or <c>run_and_wait</c> method. In this case, the destructor will appropriately
    ///     cancel and wait for the task represented by the <paramref name="_Task_handle"/> parameter to complete.</para>
    ///     <para>Throws an <see cref="invalid_multiple_scheduling Class">invalid_multiple_scheduling</see> exception if the task handle given by
    ///     the <paramref name="_Task_handle"/> parameter has already been scheduled onto a task group object via the <c>run</c> method and there has been
    ///     no intervening call to either the <c>wait</c> or <c>run_and_wait</c> method on that task group.</para>
    /// </remarks>
    /// <seealso cref="structured_task_group::wait Method"/>
    /// <seealso cref="structured_task_group::run_and_wait Method"/>
    /// <seealso cref="Task Parallelism"/>
    /// <seealso cref="location class"/>
    /**/
    template<class _Function>
    void run(task_handle<_Function>& _Task_handle)
    {
        _Task_handle._SetRuntimeOwnsLifetime(false);
        _M_task_collection._Schedule(&_Task_handle);
    }

    /// <summary>
    ///     Schedules a task on the <c>structured_task_group</c> object. The caller manages the lifetime of the <c>task_handle</c> object passed
    ///     in the <paramref name="_Task_handle"/> parameter. The version that takes the parameter <paramref name="_Placement"/> causes the task
    ///     to be biased towards executing at the location specified by that parameter.
    /// </summary>
    /// <typeparam name="_Function">
    ///     The type of the function object that will be invoked to execute the body of the task handle.
    /// </typeparam>
    /// <param name="_Task_handle">
    ///     A handle to the work being scheduled. Note that the caller has responsibility for the lifetime of this object. The runtime will
    ///     continue to expect it to live until either the <c>wait</c> or <c>run_and_wait</c> method has been called on this
    ///     <c>structured_task_group</c> object.
    /// </param>
    /// <param name="_Placement">
    ///     A reference to the location where the task represented by the <paramref name="_Task_handle"/> parameter should execute.
    /// </param>
    /// <remarks>
    ///     The runtime creates a copy of the work function that you pass to this method. Any state changes that occur in a function object that you
    ///     pass to this method will not appear in your copy of that function object.
    ///     <para>If the <c>structured_task_group</c> destructs as the result of stack unwinding from an exception, you do not need to guarantee
    ///     that a call has been made to either the <c>wait</c> or <c>run_and_wait</c> method. In this case, the destructor will appropriately
    ///     cancel and wait for the task represented by the <paramref name="_Task_handle"/> parameter to complete.</para>
    ///     <para>Throws an <see cref="invalid_multiple_scheduling Class">invalid_multiple_scheduling</see> exception if the task handle given by
    ///     the <paramref name="_Task_handle"/> parameter has already been scheduled onto a task group object via the <c>run</c> method and there has been
    ///     no intervening call to either the <c>wait</c> or <c>run_and_wait</c> method on that task group.</para>
    /// </remarks>
    /// <seealso cref="structured_task_group::wait Method"/>
    /// <seealso cref="structured_task_group::run_and_wait Method"/>
    /// <seealso cref="Task Parallelism"/>
    /// <seealso cref="location class"/>
    /**/
    template<class _Function>
    void run(task_handle<_Function>& _Task_handle, location& _Placement)
    {
        _Task_handle._SetRuntimeOwnsLifetime(false);
        _M_task_collection._Schedule(&_Task_handle, &_Placement);
    }

    /// <summary>
    ///     Waits until all work on the <c>structured_task_group</c> has completed or is canceled.
    /// </summary>
    /// <returns>
    ///     An indication of whether the wait was satisfied or the task group was canceled, due to either an explicit cancel operation or an exception
///     being thrown from one of its tasks. For more information, see <see cref="task_group_status Enumeration">task_group_status</see>.
    /// </returns>
    /// <remarks>
    ///     Note that one or more of the tasks scheduled to this <c>structured_task_group</c> object may execute inline on the calling context.
    ///     <para>If one or more of the tasks scheduled to this <c>structured_task_group</c> object throws an exception, the
    ///     runtime will select one such exception of its choosing and propagate it out of the call to the <c>wait</c> method.</para>
    ///     <para>After this function returns, the <c>structured_task_group</c> object is considered in a final state and should not be used. Note that
    ///     utilization after the <c>wait</c> method returns will result in undefined behavior.</para>
    ///     <para>In the non-exceptional path of execution, you have a mandate to call either this method or the <c>run_and_wait</c> method before
    ///     the destructor of the <c>structured_task_group</c> executes.</para>
    /// </remarks>
    /// <seealso cref="structured_task_group::wait Method"/>
    /// <seealso cref="structured_task_group::run_and_wait Method"/>
    /// <seealso cref="Task Parallelism"/>
    /**/
    task_group_status wait()
    {
        //
        // The underlying scheduler's definitions map exactly to the PPL's. No translation beyond the cast is necessary.
        //
        return static_cast<task_group_status>(_M_task_collection._Wait());
    }

    /// <summary>
    ///     Schedules a task to be run inline on the calling context with the assistance of the <c>structured_task_group</c> object for full
    ///     cancellation support. If a <c>task_handle</c> object is passed as a parameter to <c>run_and_wait</c>, the caller is
    ///     responsible for managing the lifetime of the <c>task_handle</c> object. The function then waits until all work on the
    ///     <c>structured_task_group</c> object has either completed or been canceled.
    /// </summary>
    /// <typeparam name="_Function">
    ///     The type of the function object that will be invoked to execute the body of the task handle.
    /// </typeparam>
    /// <param name="_Task_handle">
    ///     A handle to the task which will be run inline on the calling context. Note that the caller has responsibility for the lifetime of this object.
    ///     The runtime will continue to expect it to live until the <c>run_and_wait</c> method finishes execution.
    /// </param>
    /// <returns>
    ///     An indication of whether the wait was satisfied or the task group was canceled, due to either an explicit cancel operation or an exception
///     being thrown from one of its tasks. For more information, see <see cref="task_group_status Enumeration">task_group_status</see>.
    /// </returns>
    /// <remarks>
    ///     Note that one or more of the tasks scheduled to this <c>structured_task_group</c> object may execute inline on the calling context.
    ///     <para>If one or more of the tasks scheduled to this <c>structured_task_group</c> object throws an exception, the
    ///     runtime will select one such exception of its choosing and propagate it out of the call to the <c>run_and_wait</c> method.</para>
    ///     <para>After this function returns, the <c>structured_task_group</c> object is considered in a final state and should not be used.
    ///     Note that utilization after the <c>run_and_wait</c> method returns will result in undefined behavior.</para>
    ///     <para>In the non-exceptional path of execution, you have a mandate to call either this method or the <c>wait</c> method before
    ///     the destructor of the <c>structured_task_group</c> executes.</para>
    /// </remarks>
    /// <seealso cref="structured_task_group::run Method"/>
    /// <seealso cref="structured_task_group::wait Method"/>
    /// <seealso cref="Task Parallelism"/>
    /**/
    template<class _Function>
    task_group_status run_and_wait(task_handle<_Function>& _Task_handle)
    {
        //
        // The underlying scheduler's definitions map exactly to the PPL's. No translation beyond the cast is necessary.
        //
        return (task_group_status)_M_task_collection._RunAndWait(&_Task_handle);
    }

    /// <summary>
    ///     Schedules a task to be run inline on the calling context with the assistance of the <c>structured_task_group</c> object for full
    ///     cancellation support. If a <c>task_handle</c> object is passed as a parameter to <c>run_and_wait</c>, the caller is
    ///     responsible for managing the lifetime of the <c>task_handle</c> object. The function then waits until all work on the
    ///     <c>structured_task_group</c> object has either completed or been canceled.
    /// </summary>
    /// <typeparam name="_Function">
    ///     The type of the function object that will be invoked to execute the task.
    /// </typeparam>
    /// <param name="_Func">
///     A function which will be called to invoke the body of the work. This may be a lambda expression or other object which supports
    ///     a version of the function call operator with the signature <c>void operator()()</c>.
    /// </param>
    /// <returns>
    ///     An indication of whether the wait was satisfied or the task group was canceled, due to either an explicit cancel operation or an exception
///     being thrown from one of its tasks. For more information, see <see cref="task_group_status Enumeration">task_group_status</see>.
    /// </returns>
    /// <remarks>
    ///     Note that one or more of the tasks scheduled to this <c>structured_task_group</c> object may execute inline on the calling context.
    ///     <para>If one or more of the tasks scheduled to this <c>structured_task_group</c> object throws an exception, the
    ///     runtime will select one such exception of its choosing and propagate it out of the call to the <c>run_and_wait</c> method.</para>
    ///     <para>After this function returns, the <c>structured_task_group</c> object is considered in a final state and should not be used.
    ///     Note that utilization after the <c>run_and_wait</c> method returns will result in undefined behavior.</para>
    ///     <para>In the non-exceptional path of execution, you have a mandate to call either this method or the <c>wait</c> method before
    ///     the destructor of the <c>structured_task_group</c> executes.</para>
    /// </remarks>
    /// <seealso cref="structured_task_group::run Method"/>
    /// <seealso cref="structured_task_group::wait Method"/>
    /// <seealso cref="Task Parallelism"/>
    /**/
    template<class _Function>
    task_group_status run_and_wait(const _Function& _Func)
    {
        //
        // The underlying scheduler's definitions map exactly to the PPL's. No translation beyond the cast is necessary.
        //
        task_handle<_Function> _Task(_Func);
        return (task_group_status)_M_task_collection._RunAndWait(&_Task);
    }

    /// <summary>
    ///     Makes a best effort attempt to cancel the sub-tree of work rooted at this task group. Every task scheduled on the task group
    ///     will get canceled transitively if possible.
    /// </summary>
    /// <remarks>
    ///     For more information, see <see cref="Cancellation in the PPL"/>.
    /// </remarks>
    /**/
    void cancel()
    {
        _M_task_collection._Cancel();
    }

    /// <summary>
    ///     Informs the caller whether or not the task group is currently in the midst of a cancellation. This
    ///     does not necessarily indicate that the <c>cancel</c> method was called on the <c>structured_task_group</c> object
///     (although such a call certainly causes this method to return <c>true</c>). It may be the case that the <c>structured_task_group</c> object
    ///     is executing inline and a task group further up in the work tree was canceled. In cases such as these where the runtime can determine ahead
    ///     of time that cancellation will flow through this <c>structured_task_group</c> object, <c>true</c> will be returned as well.
    /// </summary>
    /// <returns>
    ///     An indication of whether the <c>structured_task_group</c> object is in the midst of a cancellation (or is guaranteed to be shortly).
    /// </returns>
    /// <remarks>
    ///     For more information, see <see cref="Cancellation in the PPL"/>.
    /// </remarks>
    /**/
    bool is_canceling()
    {
        return _M_task_collection._IsCanceling();
    }

private:

    // Disallow passing in an r-value for a task handle argument
    template<class _Function> void run(task_handle<_Function>&& _Task_handle);

    // The underlying group of tasks as known to the runtime.
    ::Concurrency::details::_StructuredTaskCollection _M_task_collection;
};
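
// An illustrative sketch (assuming the restrictions above are respected): inspecting the
// status returned by wait to detect cancellation. The canceled enumerator belongs to
// task_group_status; process_chunk is a hypothetical work function.
//
//     structured_task_group tg;
//     auto h = make_task([] { process_chunk(); });
//     tg.run(h);
//     if (tg.wait() == canceled)
//     {
//         // an explicit cancel, a canceled token, or cancellation flowing down
//         // from an enclosing group landed here
//     }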

/// <summary>
///     The <c>task_group</c> class represents a collection of parallel work which can be waited on or canceled.
/// </summary>
/// <remarks>
///     Unlike the heavily restricted <c>structured_task_group</c> class, the <c>task_group</c> class is a much more general construct.
///     It does not have any of the restrictions described by <see cref="structured_task_group Class">structured_task_group</see>. <c>task_group</c>
///     objects may safely be used across threads and utilized in free-form ways. The disadvantage of the <c>task_group</c> construct is that
///     it may not perform as well as the <c>structured_task_group</c> construct for tasks which perform small amounts of work.
///     <para>For more information, see <see cref="Task Parallelism"/>.</para>
/// </remarks>
/// <seealso cref="structured_task_group Class"/>
/// <seealso cref="task_handle Class"/>
/**/
class task_group
{
public:

    /// <summary>
    ///     Constructs a new <c>task_group</c> object.
    /// </summary>
    /// <remarks>
    ///     The constructor that takes a cancellation token creates a <c>task_group</c> that will be canceled when the source associated with
    ///     the token is canceled. Providing an explicit cancellation token also isolates this task group from participating in an implicit
    ///     cancellation from a parent group with a different token or no token.
    /// </remarks>
    /// <seealso cref="Task Parallelism"/>
    /**/
    task_group()
    {
    }
    /// <summary>
    ///     Constructs a new <c>task_group</c> object.
    /// </summary>
    /// <param name="_CancellationToken">
    ///     A cancellation token to associate with this task group. The task group will be canceled when the token is canceled.
    /// </param>
    /// <remarks>
    ///     The constructor that takes a cancellation token creates a <c>task_group</c> that will be canceled when the source associated with
    ///     the token is canceled. Providing an explicit cancellation token also isolates this task group from participating in an implicit
    ///     cancellation from a parent group with a different token or no token.
    /// </remarks>
    /// <seealso cref="Task Parallelism"/>
    /**/
    task_group(cancellation_token _CancellationToken) :
        _M_task_collection(_CancellationToken._GetImpl() != NULL ? _CancellationToken._GetImpl() : Concurrency::details::_CancellationTokenState::_None())
    {
    }

    /// <summary>
    ///     Destroys a <c>task_group</c> object. You are expected to call the either the <c>wait</c> or <c>run_and_wait</c> method on the object
    ///     prior to the destructor executing, unless the destructor is executing as the result of stack unwinding due to an exception.
    /// </summary>
    /// <remarks>
    ///     If the destructor runs as the result of normal execution (for example, not stack unwinding due to an exception) and neither the <c>wait</c> nor
    ///     <c>run_and_wait</c> methods have been called, the destructor may throw a <see cref="missing_wait Class">missing_wait</see> exception.
    /// </remarks>
    /// <seealso cref="task_group::wait Method"/>
    /// <seealso cref="task_group::run_and_wait Method"/>
    /**/
    ~task_group()
    {
    }

    /// <summary>
    ///     Schedules a task on the <c>task_group</c> object. If a <c>task_handle</c> object is passed as a parameter to <c>run</c>, the caller is
    ///     responsible for managing the lifetime of the <c>task_handle</c> object. The version of the method that takes a reference to a function
///     object as a parameter involves heap allocation inside the runtime, which may perform less well than using the version that takes a
    ///     reference to a <c>task_handle</c> object. The version which takes the parameter <paramref name="_Placement"/> causes the task to be biased
    ///     towards executing at the location specified by that parameter.
    /// </summary>
    /// <typeparam name="_Function">
    ///     The type of the function object that will be invoked to execute the body of the task handle.
    /// </typeparam>
    /// <param name="_Func">
    ///     A function which will be called to invoke the body of the task. This may be a lambda expression or other object which supports
    ///     a version of the function call operator with the signature <c>void operator()()</c>.
    /// </param>
    /// <remarks>
    ///     The runtime schedules the provided work function to run at a later time, which can be after the calling function returns.
    ///     This method uses a <see cref="task_handle Class">task_handle</see> object to hold a copy of the provided work function.
    ///     Therefore, any state changes that occur in a function object that you pass to this method will not appear in your copy of that function object.
    ///     In addition, make sure that the lifetime of any objects that you pass by pointer or by reference to the work function remain valid until
    ///     the work function returns.
    ///     <para>If the <c>task_group</c> destructs as the result of stack unwinding from an exception, you do not need to guarantee
    ///     that a call has been made to either the <c>wait</c> or <c>run_and_wait</c> method. In this case, the destructor will appropriately
    ///     cancel and wait for the task represented by the <paramref name="_Task_handle"/> parameter to complete.</para>
    ///     <para>The method throws an <see cref="invalid_multiple_scheduling Class">invalid_multiple_scheduling</see> exception if the task
    ///     handle given by the <paramref name="_Task_handle"/> parameter has already been scheduled onto a task group object via the <c>run</c>
    ///     method and there has been no intervening call to either the <c>wait</c> or <c>run_and_wait</c> method on that task group.</para>
    /// </remarks>
    /// <seealso cref="task_group::wait Method"/>
    /// <seealso cref="task_group::run_and_wait Method"/>
    /// <seealso cref="Task Parallelism"/>
    /// <seealso cref="location class"/>
    /**/
    template<typename _Function>
    void run(const _Function& _Func)
    {
        _M_task_collection._Schedule(::Concurrency::details::_UnrealizedChore::_InternalAlloc<task_handle<_Function>, _Function>(_Func));
    }

    /// <summary>
    ///     Schedules a task on the <c>task_group</c> object. If a <c>task_handle</c> object is passed as a parameter to <c>run</c>, the caller is
    ///     responsible for managing the lifetime of the <c>task_handle</c> object. The version of the method that takes a reference to a function
///     object as a parameter involves heap allocation inside the runtime, which may perform less well than using the version that takes a
    ///     reference to a <c>task_handle</c> object. The version which takes the parameter <paramref name="_Placement"/> causes the task to be biased
    ///     towards executing at the location specified by that parameter.
    /// </summary>
    /// <typeparam name="_Function">
    ///     The type of the function object that will be invoked to execute the body of the task handle.
    /// </typeparam>
    /// <param name="_Func">
    ///     A function which will be called to invoke the body of the task. This may be a lambda expression or other object which supports
    ///     a version of the function call operator with the signature <c>void operator()()</c>.
    /// </param>
    /// <param name="_Placement">
    ///     A reference to the location where the task represented by the <paramref name="_Func"/> parameter should execute.
    /// </param>
    /// <remarks>
    ///     The runtime schedules the provided work function to run at a later time, which can be after the calling function returns.
    ///     This method uses a <see cref="task_handle Class">task_handle</see> object to hold a copy of the provided work function.
    ///     Therefore, any state changes that occur in a function object that you pass to this method will not appear in your copy of that function object.
    ///     In addition, make sure that the lifetime of any objects that you pass by pointer or by reference to the work function remain valid until
    ///     the work function returns.
    ///     <para>If the <c>task_group</c> destructs as the result of stack unwinding from an exception, you do not need to guarantee
    ///     that a call has been made to either the <c>wait</c> or <c>run_and_wait</c> method. In this case, the destructor will appropriately
    ///     cancel and wait for the task represented by the <paramref name="_Task_handle"/> parameter to complete.</para>
    ///     <para>The method throws an <see cref="invalid_multiple_scheduling Class">invalid_multiple_scheduling</see> exception if the task
    ///     handle given by the <paramref name="_Task_handle"/> parameter has already been scheduled onto a task group object via the <c>run</c>
    ///     method and there has been no intervening call to either the <c>wait</c> or <c>run_and_wait</c> method on that task group.</para>
    /// </remarks>
    /// <seealso cref="task_group::wait Method"/>
    /// <seealso cref="task_group::run_and_wait Method"/>
    /// <seealso cref="Task Parallelism"/>
    /// <seealso cref="location class"/>
    /**/
    template<typename _Function>
    void run(const _Function& _Func, location& _Placement)
    {
        _M_task_collection._Schedule(::Concurrency::details::_UnrealizedChore::_InternalAlloc<task_handle<_Function>, _Function>(_Func), &_Placement);
    }

    /// <summary>
    ///     Schedules a task on the <c>task_group</c> object. If a <c>task_handle</c> object is passed as a parameter to <c>run</c>, the caller is
    ///     responsible for managing the lifetime of the <c>task_handle</c> object. The version of the method that takes a reference to a function
///     object as a parameter involves heap allocation inside the runtime, which may perform less well than using the version that takes a
    ///     reference to a <c>task_handle</c> object. The version which takes the parameter <paramref name="_Placement"/> causes the task to be biased
    ///     towards executing at the location specified by that parameter.
    /// </summary>
    /// <typeparam name="_Function">
    ///     The type of the function object that will be invoked to execute the body of the task handle.
    /// </typeparam>
    /// <param name="_Task_handle">
    ///     A handle to the work being scheduled. Note that the caller has responsibility for the lifetime of this object. The runtime will
    ///     continue to expect it to live until either the <c>wait</c> or <c>run_and_wait</c> method has been called on this
    ///     <c>task_group</c> object.
    /// </param>
    /// <remarks>
    ///     The runtime schedules the provided work function to run at a later time, which can be after the calling function returns.
    ///     This method uses a <see cref="task_handle Class">task_handle</see> object to hold a copy of the provided work function.
    ///     Therefore, any state changes that occur in a function object that you pass to this method will not appear in your copy of that function object.
    ///     In addition, make sure that the lifetime of any objects that you pass by pointer or by reference to the work function remain valid until
    ///     the work function returns.
    ///     <para>If the <c>task_group</c> destructs as the result of stack unwinding from an exception, you do not need to guarantee
    ///     that a call has been made to either the <c>wait</c> or <c>run_and_wait</c> method. In this case, the destructor will appropriately
    ///     cancel and wait for the task represented by the <paramref name="_Task_handle"/> parameter to complete.</para>
    ///     <para>The method throws an <see cref="invalid_multiple_scheduling Class">invalid_multiple_scheduling</see> exception if the task
    ///     handle given by the <paramref name="_Task_handle"/> parameter has already been scheduled onto a task group object via the <c>run</c>
    ///     method and there has been no intervening call to either the <c>wait</c> or <c>run_and_wait</c> method on that task group.</para>
    /// </remarks>
    /// <seealso cref="task_group::wait Method"/>
    /// <seealso cref="task_group::run_and_wait Method"/>
    /// <seealso cref="Task Parallelism"/>
    /// <seealso cref="location class"/>
    /**/
    template<typename _Function>
    void run(task_handle<_Function>& _Task_handle)
    {
        _Task_handle._SetRuntimeOwnsLifetime(false);
        _M_task_collection._Schedule(&_Task_handle);
    }

    /// <summary>
    ///     Schedules a task on the <c>task_group</c> object. If a <c>task_handle</c> object is passed as a parameter to <c>run</c>, the caller is
    ///     responsible for managing the lifetime of the <c>task_handle</c> object. The version of the method that takes a reference to a function
///     object as a parameter involves heap allocation inside the runtime, which may perform less well than using the version that takes a
    ///     reference to a <c>task_handle</c> object. The version which takes the parameter <paramref name="_Placement"/> causes the task to be biased
    ///     towards executing at the location specified by that parameter.
    /// </summary>
    /// <typeparam name="_Function">
    ///     The type of the function object that will be invoked to execute the body of the task handle.
    /// </typeparam>
    /// <param name="_Task_handle">
    ///     A handle to the work being scheduled. Note that the caller has responsibility for the lifetime of this object. The runtime will
    ///     continue to expect it to live until either the <c>wait</c> or <c>run_and_wait</c> method has been called on this
    ///     <c>task_group</c> object.
    /// </param>
    /// <param name="_Placement">
    ///     A reference to the location where the task represented by the <paramref name="_Func"/> parameter should execute.
    /// </param>
    /// <remarks>
    ///     The runtime schedules the provided work function to run at a later time, which can be after the calling function returns.
    ///     This method uses a <see cref="task_handle Class">task_handle</see> object to hold a copy of the provided work function.
    ///     Therefore, any state changes that occur in a function object that you pass to this method will not appear in your copy of that function object.
    ///     In addition, make sure that the lifetime of any objects that you pass by pointer or by reference to the work function remain valid until
    ///     the work function returns.
    ///     <para>If the <c>task_group</c> destructs as the result of stack unwinding from an exception, you do not need to guarantee
    ///     that a call has been made to either the <c>wait</c> or <c>run_and_wait</c> method. In this case, the destructor will appropriately
    ///     cancel and wait for the task represented by the <paramref name="_Task_handle"/> parameter to complete.</para>
    ///     <para>The method throws an <see cref="invalid_multiple_scheduling Class">invalid_multiple_scheduling</see> exception if the task
    ///     handle given by the <paramref name="_Task_handle"/> parameter has already been scheduled onto a task group object via the <c>run</c>
    ///     method and there has been no intervening call to either the <c>wait</c> or <c>run_and_wait</c> method on that task group.</para>
    /// </remarks>
    /// <seealso cref="task_group::wait Method"/>
    /// <seealso cref="task_group::run_and_wait Method"/>
    /// <seealso cref="Task Parallelism"/>
    /// <seealso cref="location class"/>
    /**/
    template<typename _Function>
    void run(task_handle<_Function>& _Task_handle, location& _Placement)
    {
        _Task_handle._SetRuntimeOwnsLifetime(false);
        _M_task_collection._Schedule(&_Task_handle, &_Placement);
    }

    /// <summary>
    ///     Waits until all work on the <c>task_group</c> object has either completed or been canceled.
    /// </summary>
    /// <returns>
    ///     An indication of whether the wait was satisfied or the task group was canceled, due to either an explicit cancel operation or an exception
    ///     being thrown from one of its tasks. For more information, see <see cref="task_group_status Enumeration">task_group_status</see>.
    /// </returns>
    /// <remarks>
    ///     Note that one or more of the tasks scheduled to this <c>task_group</c> object may execute inline on the calling context.
    ///     <para>If one or more of the tasks scheduled to this <c>task_group</c> object throws an exception, the
    ///     runtime will select one such exception of its choosing and propagate it out of the call to the <c>wait</c> method.</para>
    ///     <para>Calling <c>wait</c> on a <c>task_group</c> object resets it to a clean state where it can be reused. This includes the case
    ///     where the <c>task_group</c> object was canceled.</para>
    ///     <para>In the non-exceptional path of execution, you have a mandate to call either this method or the <c>run_and_wait</c> method before
    ///     the destructor of the <c>task_group</c> executes.</para>
    /// </remarks>
    /**/
    task_group_status wait()
    {
        //
        // The underlying scheduler's definitions map exactly to the PPL's. No translation beyond the cast is necessary.
        //
        return static_cast<task_group_status>(_M_task_collection._Wait());
    }

    /// <summary>
    ///     Schedules a task to be run inline on the calling context with the assistance of the <c>task_group</c> object for full cancellation support.
    ///     The function then waits until all work on the <c>task_group</c> object has either completed or been canceled. If a <c>task_handle</c> object
    ///     is passed as a parameter to <c>run_and_wait</c>, the caller is responsible for managing the lifetime of the <c>task_handle</c> object.
    /// </summary>
    /// <typeparam name="_Function">
    ///     The type of the function object that will be invoked to execute the body of the task handle.
    /// </typeparam>
    /// <param name="_Task_handle">
    ///     A handle to the task which will be run inline on the calling context. Note that the caller has responsibility for the lifetime of this object.
    ///     The runtime will continue to expect it to live until the <c>run_and_wait</c> method finishes execution.
    /// </param>
    /// <returns>
    ///     An indication of whether the wait was satisfied or the task group was canceled, due to either an explicit cancel operation or an exception
    ///     being thrown from one of its tasks. For more information, see <see cref="task_group_status Enumeration">task_group_status</see>.
    /// </returns>
    /// <remarks>
    ///     Note that one or more of the tasks scheduled to this <c>task_group</c> object may execute inline on the calling context.
    ///     <para>If one or more of the tasks scheduled to this <c>task_group</c> object throws an exception, the
    ///     runtime will select one such exception of its choosing and propagate it out of the call to the <c>run_and_wait</c> method.</para>
    ///     <para>Upon return from the <c>run_and_wait</c> method on a <c>task_group</c> object, the runtime resets the object to a clean state where it can be
    ///     reused. This includes the case where the <c>task_group</c> object was canceled.</para>
    ///     <para>In the non-exceptional path of execution, you have a mandate to call either this method or the <c>wait</c> method before
    ///     the destructor of the <c>task_group</c> executes.</para>
    /// </remarks>
    /// <seealso cref="task_group::run Method"/>
    /// <seealso cref="task_group::wait Method"/>
    /// <seealso cref="Task Parallelism"/>
    /**/
    template<class _Function>
    task_group_status run_and_wait(task_handle<_Function>& _Task_handle)
    {
        //
        // The underlying scheduler's definitions map exactly to the PPL's. No translation beyond the cast is necessary.
        //
        _Task_handle._SetRuntimeOwnsLifetime(false);
        return (task_group_status)_M_task_collection._RunAndWait(&_Task_handle);
    }

    /// <summary>
    ///     Schedules a task to be run inline on the calling context with the assistance of the <c>task_group</c> object for full cancellation support.
    ///     The function then waits until all work on the <c>task_group</c> object has either completed or been canceled. If a <c>task_handle</c> object
    ///     is passed as a parameter to <c>run_and_wait</c>, the caller is responsible for managing the lifetime of the <c>task_handle</c> object.
    /// </summary>
    /// <typeparam name="_Function">
    ///     The type of the function object that will be invoked to execute the body of the task.
    /// </typeparam>
    /// <param name="_Func">
    ///     A function which will be called to invoke the body of the work. This may be a lambda expression or other object which supports
    ///     a version of the function call operator with the signature <c>void operator()()</c>.
    /// </param>
    /// <returns>
    ///     An indication of whether the wait was satisfied or the task group was canceled, due to either an explicit cancel operation or an exception
    ///     being thrown from one of its tasks. For more information, see <see cref="task_group_status Enumeration">task_group_status</see>.
    /// </returns>
    /// <remarks>
    ///     Note that one or more of the tasks scheduled to this <c>task_group</c> object may execute inline on the calling context.
    ///     <para>If one or more of the tasks scheduled to this <c>task_group</c> object throws an exception, the
    ///     runtime will select one such exception of its choosing and propagate it out of the call to the <c>run_and_wait</c> method.</para>
    ///     <para>Upon return from the <c>run_and_wait</c> method on a <c>task_group</c> object, the runtime resets the object to a clean state where it can be
    ///     reused. This includes the case where the <c>task_group</c> object was canceled.</para>
    ///     <para>In the non-exceptional path of execution, you have a mandate to call either this method or the <c>wait</c> method before
    ///     the destructor of the <c>task_group</c> executes.</para>
    /// </remarks>
    /// <seealso cref="task_group::run Method"/>
    /// <seealso cref="task_group::wait Method"/>
    /// <seealso cref="Task Parallelism"/>
    /**/
    template<class _Function>
    task_group_status run_and_wait(const _Function& _Func)
    {
        //
        // The underlying scheduler's definitions map exactly to the PPL's. No translation beyond the cast is necessary.
        //
        return static_cast<task_group_status>(_M_task_collection._RunAndWait(::Concurrency::details::_UnrealizedChore::_InternalAlloc<task_handle<_Function>, _Function>(_Func)));
    }

    /// <summary>
    ///     Makes a best effort attempt to cancel the sub-tree of work rooted at this task group. Every task scheduled on the task group
    ///     will get canceled transitively if possible.
    /// </summary>
    /// <remarks>
    ///     For more information, see <see cref="Cancellation in the PPL"/>.
    /// </remarks>
    /**/
    void cancel()
    {
        _M_task_collection._Cancel();
    }

    /// <summary>
    ///     Informs the caller whether or not the task group is currently in the midst of a cancellation. This
    ///     does not necessarily indicate that the <c>cancel</c> method was called on the <c>task_group</c> object
///     (although such a call certainly causes this method to return <c>true</c>). It may be the case that the <c>task_group</c> object
    ///     is executing inline and a task group further up in the work tree was canceled. In cases such as these where the runtime can determine ahead
    ///     of time that cancellation will flow through this <c>task_group</c> object, <c>true</c> will be returned as well.
    /// </summary>
    /// <returns>
    ///     An indication of whether the <c>task_group</c> object is in the midst of a cancellation (or is guaranteed to be shortly).
    /// </returns>
    /// <remarks>
    ///     For more information, see <see cref="Cancellation in the PPL"/>.
    /// </remarks>
    /**/
    bool is_canceling()
    {
        return _M_task_collection._IsCanceling();
    }

private:

    // Disallow passing in an r-value for a task handle argument
    template<class _Function> void run(task_handle<_Function>&& _Task_handle);

    // The underlying group of tasks as known to the runtime.
    ::Concurrency::details::_TaskCollection _M_task_collection;
};
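
// An illustrative sketch: unlike structured_task_group, a task_group accepts work functions
// directly (the runtime heap-allocates and owns the internal task_handle) and resets to a
// clean state after wait, so the same object can be reused. step_one, step_two and
// step_three are hypothetical functions.
//
//     task_group tg;
//     tg.run([] { step_one(); });
//     tg.run([] { step_two(); });
//     tg.wait();                     // the group resets here
//     tg.run([] { step_three(); });  // legal: the group may be reused after wait
//     tg.wait();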

/// <summary>
///     Executes a function object immediately and synchronously in the context of a given cancellation token.
/// </summary>
/// <typeparam name="_Function">
///     The type of the function object that will be invoked.
/// </typeparam>
/// <param name="_Func">
///     The function object which will be executed. This object must support the function call operator with a signature of void(void).
/// </param>
/// <param name="_Ct">
///     The cancellation token which will control implicit cancellation of the function object. Use <c>cancellation_token::none()</c> if you want the
///     function to execute without any possibility of implicit cancellation from a parent task group being canceled.
/// </param>
/// <remarks>
///     Any interruption points in the function object will be triggered when the <c>cancellation_token</c> is canceled.
///     The explicit token <paramref name="_Ct"/> will isolate this <paramref name="_Func"/> from parent cancellation if the parent has a
///     different token or no token.
/// </remarks>
/**/
template<typename _Function>
void run_with_cancellation_token(const _Function& _Func, cancellation_token _Ct)
{
    structured_task_group _Stg(_Ct);
    _Stg.run_and_wait(_Func);
}
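
// An illustrative sketch: pairing run_with_cancellation_token with a cancellation_token_source
// (declared alongside cancellation_token in the headers this file includes). Canceling the
// source from another thread trips any interruption points inside the function object.
//
//     cancellation_token_source cts;
//     // ... hand cts to another thread, which may call cts.cancel() ...
//     run_with_cancellation_token([]
//     {
//         interruption_point();      // observes the token passed below
//         /* cancellable work */
//     }, cts.get_token());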

/// <summary>
///     Creates an interruption point for cancellation. If a cancellation is in progress in the context where this function is called, this will throw an internal
///     exception that aborts the execution of the currently executing parallel work. If cancellation is not in progress, the function does nothing.
/// </summary>
/// <remarks>
///     You should not catch the internal cancellation exception thrown by the <c>interruption_point()</c> function. The exception will be caught and handled by
///     the runtime, and catching it may cause your program to behave abnormally.
/// </remarks>
/**/
inline void interruption_point()
{
    structured_task_group _Stg;
    _Stg.wait();
}
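
// An illustrative sketch: calling interruption_point inside a long loop makes the loop
// cancellable. Do not wrap the call in a catch-all handler; the internal cancellation
// exception must reach the runtime. process is a hypothetical per-item function and
// token a previously obtained cancellation_token.
//
//     run_with_cancellation_token([]
//     {
//         for (int i = 0; i < 1000000; ++i)
//         {
//             interruption_point();  // aborts the remaining work if a cancellation is in progress
//             process(i);
//         }
//     }, token);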

/// <summary>
///     Returns an indication of whether the task group which is currently executing inline on the current context
///     is in the midst of an active cancellation (or will be shortly). Note that if there is no task group currently
///     executing inline on the current context, <c>false</c> will be returned.
/// </summary>
/// <returns>
///     <c>true</c> if the task group which is currently executing is canceling, <c>false</c> otherwise.
/// </returns>
/// <remarks>
///     For more information, see <see cref="Cancellation in the PPL"/>.
/// </remarks>
/// <seealso cref="task_group Class"/>
/// <seealso cref="structured_task_group Class"/>
/**/
_CRTIMP2 bool __cdecl is_current_task_group_canceling();
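
// An illustrative sketch: polling is_current_task_group_canceling lets a task exit gracefully
// (flushing partial results, releasing resources) instead of being unwound at an interruption
// point. has_more_work and do_step are hypothetical functions; tg is a task_group.
//
//     tg.run([]
//     {
//         while (has_more_work())
//         {
//             if (is_current_task_group_canceling())
//             {
//                 break;             // cooperative, orderly shutdown
//             }
//             do_step();
//         }
//     });
//     tg.wait();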

// Parallel Algorithms and Patterns

// Helper function that implements parallel_invoke with two functions
// Used by parallel_for and parallel_for_each implementations

template <typename _Function1, typename _Function2>
void _Parallel_invoke_impl(const _Function1& _Func1, const _Function2& _Func2)
{
    structured_task_group _Task_group;

    task_handle<_Function1> _Task_handle1(_Func1);
    _Task_group.run(_Task_handle1);

    // We inline the last item to prevent the unnecessary push/pop on the work queue.
    task_handle<_Function2> _Task_handle2(_Func2);
    _Task_group.run_and_wait(_Task_handle2);
}

/// <summary>
///     Executes the function objects supplied as parameters in parallel, and blocks until they have finished executing. Each function object
///     could be a lambda expression, a pointer to function, or any object that supports the function call operator with the signature
///     <c>void operator()()</c>.
/// </summary>
/// <typeparam name="_Function1">
///     The type of the first function object to be executed in parallel.
/// </typeparam>
/// <typeparam name="_Function2">
///     The type of the second function object to be executed in parallel.
/// </typeparam>
/// <param name="_Func1">
///     The first function object to be executed in parallel.
/// </param>
/// <param name="_Func2">
///     The second function object to be executed in parallel.
/// </param>
/// <remarks>
///     Note that one or more of the function objects supplied as parameters may execute inline on the calling context.
///     <para>If one or more of the function objects passed as parameters to this function throws an exception, the
///     runtime will select one such exception of its choosing and propagate it out of the call to <c>parallel_invoke</c>.</para>
///     <para>For more information, see <see cref="Parallel Algorithms"/>.</para>
/// </remarks>
/**/
template <typename _Function1, typename _Function2>
void parallel_invoke(const _Function1& _Func1, const _Function2& _Func2)
{
    _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_START);

    _Parallel_invoke_impl(_Func1, _Func2);

    _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_END);
}
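
// An illustrative sketch: parallel_invoke blocks until both function objects finish, so
// results written through captured references are safe to read afterwards. compute_left
// and compute_right are hypothetical functions.
//
//     int left = 0, right = 0;
//     parallel_invoke(
//         [&left]  { left  = compute_left();  },
//         [&right] { right = compute_right(); });
//     // both lambdas have completed; left and right may be read safely here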

/// <summary>
///     Executes the function objects supplied as parameters in parallel, and blocks until they have finished executing. Each function object
///     could be a lambda expression, a pointer to function, or any object that supports the function call operator with the signature
///     <c>void operator()()</c>.
/// </summary>
/// <typeparam name="_Function1">
///     The type of the first function object to be executed in parallel.
/// </typeparam>
/// <typeparam name="_Function2">
///     The type of the second function object to be executed in parallel.
/// </typeparam>
/// <typeparam name="_Function3">
///     The type of the third function object to be executed in parallel.
/// </typeparam>
/// <param name="_Func1">
///     The first function object to be executed in parallel.
/// </param>
/// <param name="_Func2">
///     The second function object to be executed in parallel.
/// </param>
/// <param name="_Func3">
///     The third function object to be executed in parallel.
/// </param>
/// <remarks>
///     Note that one or more of the function objects supplied as parameters may execute inline on the calling context.
///     <para>If one or more of the function objects passed as parameters to this function throws an exception, the
///     runtime will select one such exception of its choosing and propagate it out of the call to <c>parallel_invoke</c>.</para>
///     <para>For more information, see <see cref="Parallel Algorithms"/>.</para>
/// </remarks>
/**/
template <typename _Function1, typename _Function2, typename _Function3>
void parallel_invoke(const _Function1& _Func1, const _Function2& _Func2, const _Function3& _Func3)
{
    _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_START);

    structured_task_group _Task_group;

    task_handle<_Function1> _Task_handle1(_Func1);
    _Task_group.run(_Task_handle1);

    task_handle<_Function2> _Task_handle2(_Func2);
    _Task_group.run(_Task_handle2);

    task_handle<_Function3> _Task_handle3(_Func3);
    _Task_group.run_and_wait(_Task_handle3);

    _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_END);
}

/// <summary>
///     Executes the function objects supplied as parameters in parallel, and blocks until they have finished executing. Each function object
///     could be a lambda expression, a pointer to function, or any object that supports the function call operator with the signature
///     <c>void operator()()</c>.
/// </summary>
/// <typeparam name="_Function1">
///     The type of the first function object to be executed in parallel.
/// </typeparam>
/// <typeparam name="_Function2">
///     The type of the second function object to be executed in parallel.
/// </typeparam>
/// <typeparam name="_Function3">
///     The type of the third function object to be executed in parallel.
/// </typeparam>
/// <typeparam name="_Function4">
///     The type of the fourth function object to be executed in parallel.
/// </typeparam>
/// <param name="_Func1">
///     The first function object to be executed in parallel.
/// </param>
/// <param name="_Func2">
///     The second function object to be executed in parallel.
/// </param>
/// <param name="_Func3">
///     The third function object to be executed in parallel.
/// </param>
/// <param name="_Func4">
///     The fourth function object to be executed in parallel.
/// </param>
/// <remarks>
///     Note that one or more of the function objects supplied as parameters may execute inline on the calling context.
///     <para>If one or more of the function objects passed as parameters to this function throws an exception, the
///     runtime will select one such exception of its choosing and propagate it out of the call to <c>parallel_invoke</c>.</para>
///     <para>For more information, see <see cref="Parallel Algorithms"/>.</para>
/// </remarks>
/**/
template <typename _Function1, typename _Function2, typename _Function3, typename _Function4>
void parallel_invoke(const _Function1& _Func1, const _Function2& _Func2, const _Function3& _Func3, const _Function4& _Func4)
{
    _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_START);

    structured_task_group _Task_group;

    task_handle<_Function1> _Task_handle1(_Func1);
    _Task_group.run(_Task_handle1);

    task_handle<_Function2> _Task_handle2(_Func2);
    _Task_group.run(_Task_handle2);

    task_handle<_Function3> _Task_handle3(_Func3);
    _Task_group.run(_Task_handle3);

    task_handle<_Function4> _Task_handle4(_Func4);
    _Task_group.run_and_wait(_Task_handle4);

    _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_END);
}

/// <summary>
///     Executes the function objects supplied as parameters in parallel, and blocks until they have finished executing. Each function object
///     could be a lambda expression, a pointer to function, or any object that supports the function call operator with the signature
///     <c>void operator()()</c>.
/// </summary>
/// <typeparam name="_Function1">
///     The type of the first function object to be executed in parallel.
/// </typeparam>
/// <typeparam name="_Function2">
///     The type of the second function object to be executed in parallel.
/// </typeparam>
/// <typeparam name="_Function3">
///     The type of the third function object to be executed in parallel.
/// </typeparam>
/// <typeparam name="_Function4">
///     The type of the fourth function object to be executed in parallel.
/// </typeparam>
/// <typeparam name="_Function5">
///     The type of the fifth function object to be executed in parallel.
/// </typeparam>
/// <param name="_Func1">
///     The first function object to be executed in parallel.
/// </param>
/// <param name="_Func2">
///     The second function object to be executed in parallel.
/// </param>
/// <param name="_Func3">
///     The third function object to be executed in parallel.
/// </param>
/// <param name="_Func4">
///     The fourth function object to be executed in parallel.
/// </param>
/// <param name="_Func5">
///     The fifth function object to be executed in parallel.
/// </param>
/// <remarks>
///     Note that one or more of the function objects supplied as parameters may execute inline on the calling context.
///     <para>If one or more of the function objects passed as parameters to this function throws an exception, the
///     runtime will select one such exception of its choosing and propagate it out of the call to <c>parallel_invoke</c>.</para>
///     <para>For more information, see <see cref="Parallel Algorithms"/>.</para>
/// </remarks>
/**/
template <typename _Function1, typename _Function2, typename _Function3, typename _Function4, typename _Function5>
void parallel_invoke(const _Function1& _Func1, const _Function2& _Func2, const _Function3& _Func3, const _Function4& _Func4, const _Function5& _Func5)
{
    _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_START);

    structured_task_group _Task_group;

    task_handle<_Function1> _Task_handle1(_Func1);
    _Task_group.run(_Task_handle1);

    task_handle<_Function2> _Task_handle2(_Func2);
    _Task_group.run(_Task_handle2);

    task_handle<_Function3> _Task_handle3(_Func3);
    _Task_group.run(_Task_handle3);

    task_handle<_Function4> _Task_handle4(_Func4);
    _Task_group.run(_Task_handle4);

    task_handle<_Function5> _Task_handle5(_Func5);
    _Task_group.run_and_wait(_Task_handle5);

    _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_END);
}

/// <summary>
///     Executes the function objects supplied as parameters in parallel, and blocks until they have finished executing. Each function object
///     could be a lambda expression, a pointer to function, or any object that supports the function call operator with the signature
///     <c>void operator()()</c>.
/// </summary>
/// <typeparam name="_Function1">
///     The type of the first function object to be executed in parallel.
/// </typeparam>
/// <typeparam name="_Function2">
///     The type of the second function object to be executed in parallel.
/// </typeparam>
/// <typeparam name="_Function3">
///     The type of the third function object to be executed in parallel.
/// </typeparam>
/// <typeparam name="_Function4">
///     The type of the fourth function object to be executed in parallel.
/// </typeparam>
/// <typeparam name="_Function5">
///     The type of the fifth function object to be executed in parallel.
/// </typeparam>
/// <typeparam name="_Function6">
///     The type of the sixth function object to be executed in parallel.
/// </typeparam>
/// <param name="_Func1">
///     The first function object to be executed in parallel.
/// </param>
/// <param name="_Func2">
///     The second function object to be executed in parallel.
/// </param>
/// <param name="_Func3">
///     The third function object to be executed in parallel.
/// </param>
/// <param name="_Func4">
///     The fourth function object to be executed in parallel.
/// </param>
/// <param name="_Func5">
///     The fifth function object to be executed in parallel.
/// </param>
/// <param name="_Func6">
///     The sixth function object to be executed in parallel.
/// </param>
/// <remarks>
///     Note that one or more of the function objects supplied as parameters may execute inline on the calling context.
///     <para>If one or more of the function objects passed as parameters to this function throws an exception, the
///     runtime will select one such exception of its choosing and propagate it out of the call to <c>parallel_invoke</c>.</para>
///     <para>For more information, see <see cref="Parallel Algorithms"/>.</para>
/// </remarks>
/**/
template <typename _Function1, typename _Function2, typename _Function3, typename _Function4, typename _Function5,
    typename _Function6>
void parallel_invoke(const _Function1& _Func1, const _Function2& _Func2, const _Function3& _Func3, const _Function4& _Func4, const _Function5& _Func5,
    const _Function6& _Func6)
{
    _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_START);

    structured_task_group _Task_group;

    task_handle<_Function1> _Task_handle1(_Func1);
    _Task_group.run(_Task_handle1);

    task_handle<_Function2> _Task_handle2(_Func2);
    _Task_group.run(_Task_handle2);

    task_handle<_Function3> _Task_handle3(_Func3);
    _Task_group.run(_Task_handle3);

    task_handle<_Function4> _Task_handle4(_Func4);
    _Task_group.run(_Task_handle4);

    task_handle<_Function5> _Task_handle5(_Func5);
    _Task_group.run(_Task_handle5);

    task_handle<_Function6> _Task_handle6(_Func6);
    _Task_group.run_and_wait(_Task_handle6);

    _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_END);
}

/// <summary>
///     Executes the function objects supplied as parameters in parallel, and blocks until they have finished executing. Each function object
///     could be a lambda expression, a pointer to function, or any object that supports the function call operator with the signature
///     <c>void operator()()</c>.
/// </summary>
/// <typeparam name="_Function1">
///     The type of the first function object to be executed in parallel.
/// </typeparam>
/// <typeparam name="_Function2">
///     The type of the second function object to be executed in parallel.
/// </typeparam>
/// <typeparam name="_Function3">
///     The type of the third function object to be executed in parallel.
/// </typeparam>
/// <typeparam name="_Function4">
///     The type of the fourth function object to be executed in parallel.
/// </typeparam>
/// <typeparam name="_Function5">
///     The type of the fifth function object to be executed in parallel.
/// </typeparam>
/// <typeparam name="_Function6">
///     The type of the sixth function object to be executed in parallel.
/// </typeparam>
/// <typeparam name="_Function7">
///     The type of the seventh function object to be executed in parallel.
/// </typeparam>
/// <param name="_Func1">
///     The first function object to be executed in parallel.
/// </param>
/// <param name="_Func2">
///     The second function object to be executed in parallel.
/// </param>
/// <param name="_Func3">
///     The third function object to be executed in parallel.
/// </param>
/// <param name="_Func4">
///     The fourth function object to be executed in parallel.
/// </param>
/// <param name="_Func5">
///     The fifth function object to be executed in parallel.
/// </param>
/// <param name="_Func6">
///     The sixth function object to be executed in parallel.
/// </param>
/// <param name="_Func7">
///     The seventh function object to be executed in parallel.
/// </param>
/// <remarks>
///     Note that one or more of the function objects supplied as parameters may execute inline on the calling context.
///     <para>If one or more of the function objects passed as parameters to this function throws an exception, the
///     runtime will select one such exception of its choosing and propagate it out of the call to <c>parallel_invoke</c>.</para>
///     <para>For more information, see <see cref="Parallel Algorithms"/>.</para>
/// </remarks>
/**/
template <typename _Function1, typename _Function2, typename _Function3, typename _Function4, typename _Function5,
    typename _Function6, typename _Function7>
void parallel_invoke(const _Function1& _Func1, const _Function2& _Func2, const _Function3& _Func3, const _Function4& _Func4, const _Function5& _Func5,
    const _Function6& _Func6, const _Function7& _Func7)
{
    _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_START);

    structured_task_group _Task_group;

    task_handle<_Function1> _Task_handle1(_Func1);
    _Task_group.run(_Task_handle1);

    task_handle<_Function2> _Task_handle2(_Func2);
    _Task_group.run(_Task_handle2);

    task_handle<_Function3> _Task_handle3(_Func3);
    _Task_group.run(_Task_handle3);

    task_handle<_Function4> _Task_handle4(_Func4);
    _Task_group.run(_Task_handle4);

    task_handle<_Function5> _Task_handle5(_Func5);
    _Task_group.run(_Task_handle5);

    task_handle<_Function6> _Task_handle6(_Func6);
    _Task_group.run(_Task_handle6);

    task_handle<_Function7> _Task_handle7(_Func7);
    _Task_group.run_and_wait(_Task_handle7);

    _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_END);
}

/// <summary>
///     Executes the function objects supplied as parameters in parallel, and blocks until they have finished executing. Each function object
///     could be a lambda expression, a pointer to function, or any object that supports the function call operator with the signature
///     <c>void operator()()</c>.
/// </summary>
/// <typeparam name="_Function1">
///     The type of the first function object to be executed in parallel.
/// </typeparam>
/// <typeparam name="_Function2">
///     The type of the second function object to be executed in parallel.
/// </typeparam>
/// <typeparam name="_Function3">
///     The type of the third function object to be executed in parallel.
/// </typeparam>
/// <typeparam name="_Function4">
///     The type of the fourth function object to be executed in parallel.
/// </typeparam>
/// <typeparam name="_Function5">
///     The type of the fifth function object to be executed in parallel.
/// </typeparam>
/// <typeparam name="_Function6">
///     The type of the sixth function object to be executed in parallel.
/// </typeparam>
/// <typeparam name="_Function7">
///     The type of the seventh function object to be executed in parallel.
/// </typeparam>
/// <typeparam name="_Function8">
///     The type of the eighth function object to be executed in parallel.
/// </typeparam>
/// <param name="_Func1">
///     The first function object to be executed in parallel.
/// </param>
/// <param name="_Func2">
///     The second function object to be executed in parallel.
/// </param>
/// <param name="_Func3">
///     The third function object to be executed in parallel.
/// </param>
/// <param name="_Func4">
///     The fourth function object to be executed in parallel.
/// </param>
/// <param name="_Func5">
///     The fifth function object to be executed in parallel.
/// </param>
/// <param name="_Func6">
///     The sixth function object to be executed in parallel.
/// </param>
/// <param name="_Func7">
///     The seventh function object to be executed in parallel.
/// </param>
/// <param name="_Func8">
///     The eighth function object to be executed in parallel.
/// </param>
/// <remarks>
///     Note that one or more of the function objects supplied as parameters may execute inline on the calling context.
///     <para>If one or more of the function objects passed as parameters to this function throws an exception, the
///     runtime will select one such exception of its choosing and propagate it out of the call to <c>parallel_invoke</c>.</para>
///     <para>For more information, see <see cref="Parallel Algorithms"/>.</para>
/// </remarks>
/**/
template <typename _Function1, typename _Function2, typename _Function3, typename _Function4, typename _Function5,
    typename _Function6, typename _Function7, typename _Function8>
void parallel_invoke(const _Function1& _Func1, const _Function2& _Func2, const _Function3& _Func3, const _Function4& _Func4, const _Function5& _Func5,
    const _Function6& _Func6, const _Function7& _Func7, const _Function8& _Func8)
{
    _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_START);

    structured_task_group _Task_group;

    task_handle<_Function1> _Task_handle1(_Func1);
    _Task_group.run(_Task_handle1);

    task_handle<_Function2> _Task_handle2(_Func2);
    _Task_group.run(_Task_handle2);

    task_handle<_Function3> _Task_handle3(_Func3);
    _Task_group.run(_Task_handle3);

    task_handle<_Function4> _Task_handle4(_Func4);
    _Task_group.run(_Task_handle4);

    task_handle<_Function5> _Task_handle5(_Func5);
    _Task_group.run(_Task_handle5);

    task_handle<_Function6> _Task_handle6(_Func6);
    _Task_group.run(_Task_handle6);

    task_handle<_Function7> _Task_handle7(_Func7);
    _Task_group.run(_Task_handle7);

    task_handle<_Function8> _Task_handle8(_Func8);
    _Task_group.run_and_wait(_Task_handle8);

    _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_END);
}

/// <summary>
///     Executes the function objects supplied as parameters in parallel, and blocks until they have finished executing. Each function object
///     could be a lambda expression, a pointer to function, or any object that supports the function call operator with the signature
///     <c>void operator()()</c>.
/// </summary>
/// <typeparam name="_Function1">
///     The type of the first function object to be executed in parallel.
/// </typeparam>
/// <typeparam name="_Function2">
///     The type of the second function object to be executed in parallel.
/// </typeparam>
/// <typeparam name="_Function3">
///     The type of the third function object to be executed in parallel.
/// </typeparam>
/// <typeparam name="_Function4">
///     The type of the fourth function object to be executed in parallel.
/// </typeparam>
/// <typeparam name="_Function5">
///     The type of the fifth function object to be executed in parallel.
/// </typeparam>
/// <typeparam name="_Function6">
///     The type of the sixth function object to be executed in parallel.
/// </typeparam>
/// <typeparam name="_Function7">
///     The type of the seventh function object to be executed in parallel.
/// </typeparam>
/// <typeparam name="_Function8">
///     The type of the eighth function object to be executed in parallel.
/// </typeparam>
/// <typeparam name="_Function9">
///     The type of the ninth function object to be executed in parallel.
/// </typeparam>
/// <param name="_Func1">
///     The first function object to be executed in parallel.
/// </param>
/// <param name="_Func2">
///     The second function object to be executed in parallel.
/// </param>
/// <param name="_Func3">
///     The third function object to be executed in parallel.
/// </param>
/// <param name="_Func4">
///     The fourth function object to be executed in parallel.
/// </param>
/// <param name="_Func5">
///     The fifth function object to be executed in parallel.
/// </param>
/// <param name="_Func6">
///     The sixth function object to be executed in parallel.
/// </param>
/// <param name="_Func7">
///     The seventh function object to be executed in parallel.
/// </param>
/// <param name="_Func8">
///     The eighth function object to be executed in parallel.
/// </param>
/// <param name="_Func9">
///     The ninth function object to be executed in parallel.
/// </param>
/// <remarks>
///     Note that one or more of the function objects supplied as parameters may execute inline on the calling context.
///     <para>If one or more of the function objects passed as parameters to this function throws an exception, the
///     runtime will select one such exception of its choosing and propagate it out of the call to <c>parallel_invoke</c>.</para>
///     <para>For more information, see <see cref="Parallel Algorithms"/>.</para>
/// </remarks>
/**/
template <typename _Function1, typename _Function2, typename _Function3, typename _Function4, typename _Function5,
    typename _Function6, typename _Function7, typename _Function8, typename _Function9>
void parallel_invoke(const _Function1& _Func1, const _Function2& _Func2, const _Function3& _Func3, const _Function4& _Func4, const _Function5& _Func5,
    const _Function6& _Func6, const _Function7& _Func7, const _Function8& _Func8, const _Function9& _Func9)
{
    _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_START);

    structured_task_group _Task_group;

    task_handle<_Function1> _Task_handle1(_Func1);
    _Task_group.run(_Task_handle1);

    task_handle<_Function2> _Task_handle2(_Func2);
    _Task_group.run(_Task_handle2);

    task_handle<_Function3> _Task_handle3(_Func3);
    _Task_group.run(_Task_handle3);

    task_handle<_Function4> _Task_handle4(_Func4);
    _Task_group.run(_Task_handle4);

    task_handle<_Function5> _Task_handle5(_Func5);
    _Task_group.run(_Task_handle5);

    task_handle<_Function6> _Task_handle6(_Func6);
    _Task_group.run(_Task_handle6);

    task_handle<_Function7> _Task_handle7(_Func7);
    _Task_group.run(_Task_handle7);

    task_handle<_Function8> _Task_handle8(_Func8);
    _Task_group.run(_Task_handle8);

    task_handle<_Function9> _Task_handle9(_Func9);
    _Task_group.run_and_wait(_Task_handle9);

    _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_END);
}

/// <summary>
///     Executes the function objects supplied as parameters in parallel, and blocks until they have finished executing. Each function object
///     could be a lambda expression, a pointer to function, or any object that supports the function call operator with the signature
///     <c>void operator()()</c>.
/// </summary>
/// <typeparam name="_Function1">
///     The type of the first function object to be executed in parallel.
/// </typeparam>
/// <typeparam name="_Function2">
///     The type of the second function object to be executed in parallel.
/// </typeparam>
/// <typeparam name="_Function3">
///     The type of the third function object to be executed in parallel.
/// </typeparam>
/// <typeparam name="_Function4">
///     The type of the fourth function object to be executed in parallel.
/// </typeparam>
/// <typeparam name="_Function5">
///     The type of the fifth function object to be executed in parallel.
/// </typeparam>
/// <typeparam name="_Function6">
///     The type of the sixth function object to be executed in parallel.
/// </typeparam>
/// <typeparam name="_Function7">
///     The type of the seventh function object to be executed in parallel.
/// </typeparam>
/// <typeparam name="_Function8">
///     The type of the eighth function object to be executed in parallel.
/// </typeparam>
/// <typeparam name="_Function9">
///     The type of the ninth function object to be executed in parallel.
/// </typeparam>
/// <typeparam name="_Function10">
///     The type of the tenth function object to be executed in parallel.
/// </typeparam>
/// <param name="_Func1">
///     The first function object to be executed in parallel.
/// </param>
/// <param name="_Func2">
///     The second function object to be executed in parallel.
/// </param>
/// <param name="_Func3">
///     The third function object to be executed in parallel.
/// </param>
/// <param name="_Func4">
///     The fourth function object to be executed in parallel.
/// </param>
/// <param name="_Func5">
///     The fifth function object to be executed in parallel.
/// </param>
/// <param name="_Func6">
///     The sixth function object to be executed in parallel.
/// </param>
/// <param name="_Func7">
///     The seventh function object to be executed in parallel.
/// </param>
/// <param name="_Func8">
///     The eighth function object to be executed in parallel.
/// </param>
/// <param name="_Func9">
///     The ninth function object to be executed in parallel.
/// </param>
/// <param name="_Func10">
///     The tenth function object to be executed in parallel.
/// </param>
/// <remarks>
///     Note that one or more of the function objects supplied as parameters may execute inline on the calling context.
///     <para>If one or more of the function objects passed as parameters to this function throws an exception, the
///     runtime will select one such exception of its choosing and propagate it out of the call to <c>parallel_invoke</c>.</para>
///     <para>For more information, see <see cref="Parallel Algorithms"/>.</para>
/// </remarks>
/**/
template <typename _Function1, typename _Function2, typename _Function3, typename _Function4, typename _Function5,
    typename _Function6, typename _Function7, typename _Function8, typename _Function9, typename _Function10>
void parallel_invoke(const _Function1& _Func1, const _Function2& _Func2, const _Function3& _Func3, const _Function4& _Func4, const _Function5& _Func5,
    const _Function6& _Func6, const _Function7& _Func7, const _Function8& _Func8, const _Function9& _Func9, const _Function10& _Func10)
{
    _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_START);

    structured_task_group _Task_group;

    task_handle<_Function1> _Task_handle1(_Func1);
    _Task_group.run(_Task_handle1);

    task_handle<_Function2> _Task_handle2(_Func2);
    _Task_group.run(_Task_handle2);

    task_handle<_Function3> _Task_handle3(_Func3);
    _Task_group.run(_Task_handle3);

    task_handle<_Function4> _Task_handle4(_Func4);
    _Task_group.run(_Task_handle4);

    task_handle<_Function5> _Task_handle5(_Func5);
    _Task_group.run(_Task_handle5);

    task_handle<_Function6> _Task_handle6(_Func6);
    _Task_group.run(_Task_handle6);

    task_handle<_Function7> _Task_handle7(_Func7);
    _Task_group.run(_Task_handle7);

    task_handle<_Function8> _Task_handle8(_Func8);
    _Task_group.run(_Task_handle8);

    task_handle<_Function9> _Task_handle9(_Func9);
    _Task_group.run(_Task_handle9);

    task_handle<_Function10> _Task_handle10(_Func10);
    _Task_group.run_and_wait(_Task_handle10);

    _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_END);
}
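
// Illustrative sketch (hypothetical example, not part of the header): as the
// remarks above state, when several of the supplied functions throw, the runtime
// selects one such exception and propagates it out of the parallel_invoke call,
// where it can be caught as usual.
//
//   try
//   {
//       Concurrency::parallel_invoke(
//           [] { throw std::runtime_error("first"); },
//           [] { throw std::runtime_error("second"); });
//   }
//   catch (const std::runtime_error&)
//   {
//       // Exactly one of the two exceptions arrives here; which one is unspecified.
//   }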

/// <summary>
///     The <c>auto_partitioner</c> class represents the default method <c>parallel_for</c>, <c>parallel_for_each</c> and
///     <c>parallel_transform</c> use to partition the range they iterate over. This method of partitioning employs range stealing
///     for load balancing as well as per-iterate cancellation.
/// </summary>
/**/
class auto_partitioner
{
public:
    /// <summary>
    ///     Constructs an <c>auto_partitioner</c> object.
    /// </summary>
    /**/
    auto_partitioner() {}

    /// <summary>
    ///     Destroys an <c>auto_partitioner</c> object.
    /// </summary>
    /**/
    ~auto_partitioner() {}

    template<class _Type>
    _Type _Get_num_chunks(_Type ) const
    {
        return static_cast<_Type>(Concurrency::details::_CurrentScheduler::_GetNumberOfVirtualProcessors());
    }
};

/// <summary>
///     The <c>static_partitioner</c> class represents a static partitioning of the range iterated over by <c>parallel_for</c>. The partitioner
///     divides the range into as many chunks as there are workers available to the underlying scheduler.
/// </summary>
/**/
class static_partitioner
{
public:
    /// <summary>
    ///     Constructs a <c>static_partitioner</c> object.
    /// </summary>
    /**/
    static_partitioner()
    {
    }

    /// <summary>
    ///     Destroys a <c>static_partitioner</c> object.
    /// </summary>
    /**/
    ~static_partitioner() {}

    template<class _Type>
    _Type _Get_num_chunks(_Type ) const
    {
        return static_cast<_Type>(Concurrency::details::_CurrentScheduler::_GetNumberOfVirtualProcessors());
    }
};
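
// Illustrative usage (a sketch; assumes the parallel_for overloads that accept a
// partitioner argument, declared later in this header): auto_partitioner is the
// default and balances load by range stealing, while static_partitioner divides
// the range once, up front, into one chunk per virtual processor.
//
//   // Default partitioning (equivalent to passing auto_partitioner()):
//   Concurrency::parallel_for(0, 1000, [](int _I) { /* possibly uneven work */ });
//
//   // Static partitioning; best when every iteration does about the same work:
//   Concurrency::parallel_for(0, 1000,
//       [](int _I) { /* uniform work */ },
//       Concurrency::static_partitioner());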

/// <summary>
///     The <c>simple_partitioner</c> class represents a static partitioning of the range iterated over by <c>parallel_for</c>. The partitioner
///     divides the range into chunks such that each chunk has at least the number of iterations specified by the chunk size.
/// </summary>
/**/
class simple_partitioner
{
private:
    typedef unsigned long long _Size_type;

public:
    /// <summary>
    ///     Constructs a <c>simple_partitioner</c> object.
    /// </summary>
    /// <param name="_Chunk_size">
    ///     Number of iterations per chunk.
    /// </param>
    /**/
    explicit simple_partitioner(_Size_type _Chunk_size) : _M_chunk_size(_Chunk_size)
    {
        if (_Chunk_size == 0)
        {
            throw std::invalid_argument("_Chunk_size");
        }
    }

    /// <summary>
    ///     Destroys a <c>simple_partitioner</c> object.
    /// </summary>
    /**/
    ~simple_partitioner() {}

    template<class _Type>
    _Type _Get_num_chunks(_Type _Range_arg) const
    {
        static_assert(sizeof(_Type) <= sizeof(_Size_type), "Potential truncation of _Range_arg");
        _Size_type _Num_chunks = (static_cast<_Size_type>(_Range_arg) / _M_chunk_size);

        if (_Num_chunks == 0)
        {
           _Num_chunks = 1;
        }

        return static_cast<_Type>(_Num_chunks);
    }

private:

    _Size_type _M_chunk_size;
};
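
// Illustrative usage (a sketch, not part of the header): a simple_partitioner
// constructed with a chunk size of 256 guarantees every chunk spans at least 256
// iterations, which can amortize scheduling overhead when the loop body is cheap.
//
//   Concurrency::parallel_for(0, 100000,
//       [](int _I) { /* very small amount of work per iteration */ },
//       Concurrency::simple_partitioner(256));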

/// <summary>
///     The <c>affinity_partitioner</c> class is similar to the <c>static_partitioner</c> class, but it improves cache affinity
///     by its choice of mapping subranges to worker threads. It can improve performance significantly when a loop is re-executed over
///     the same data set, and the data fits in cache. Note that the same <c>affinity_partitioner</c> object must be used with subsequent
///     iterations of a parallel loop that is executed over a particular data set, to benefit from data locality.
/// </summary>
/**/
class affinity_partitioner
{
public:

    /// <summary>
    ///     Constructs an <c>affinity_partitioner</c> object.
    /// </summary>
    /**/
    affinity_partitioner() : _M_num_chunks(0), _M_pChunk_locations(NULL)
    {
    }

    /// <summary>
    ///     Destroys an <c>affinity_partitioner</c> object.
    /// </summary>
    /**/
    ~affinity_partitioner()
    {
        delete [] _M_pChunk_locations;
    }

    location& _Get_chunk_location(unsigned int _ChunkIndex)
    {
        return _M_pChunk_locations[_ChunkIndex];
    }

    template<class _Type>
    _Type _Get_num_chunks(_Type )
    {
        if (_M_num_chunks == 0)
        {
            _Initialize_locations(Concurrency::details::_CurrentScheduler::_GetNumberOfVirtualProcessors());
        }

        return static_cast<_Type>(_M_num_chunks);
    }

private:
    // The number of chunks the partitioner will record affinity for.
    unsigned int _M_num_chunks;

    // Array of remembered locations.
    location * _M_pChunk_locations;

    void _Initialize_locations(unsigned int _Num_chunks)
    {
        _M_num_chunks = _Num_chunks;
        _M_pChunk_locations = new location[_Num_chunks];
    }
};
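
// Illustrative usage (a sketch; _Data and _N are hypothetical): to benefit from
// the recorded affinities, the *same* affinity_partitioner object must be passed
// to every execution of the loop over a given data set. Locations are recorded on
// the first pass and replayed on subsequent passes.
//
//   Concurrency::affinity_partitioner _Ap;
//   for (int _Pass = 0; _Pass < 10; ++_Pass)
//   {
//       Concurrency::parallel_for(0, _N,
//           [&](int _I) { _Data[_I] *= 2; },
//           _Ap); // reusing _Ap lets chunks revisit the workers that last touched them
//   }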

// Helper methods for scheduling and executing parallel tasks

// Disable C4180: qualifier applied to function type has no meaning; ignored
// Warning fires for passing Foo function pointer to parallel_for instead of &Foo.
#pragma warning(push)
#pragma warning(disable: 4180)
// Disable C6263: using _alloca in a loop; this can quickly overflow stack
#pragma warning(disable: 6263)

// Template class that invokes user function on a parallel_for_each

template <typename _Random_iterator, typename _Index_type, typename _Function, bool _Is_iterator>
class _Parallel_chunk_helper_invoke
{
public:
    static void __cdecl _Invoke(const _Random_iterator& _First, _Index_type& _Index, const _Function& _Func)
    {
        _Func(_First[_Index]);
    }
};

// Template specialized class that invokes user function on a parallel_for

template <typename _Random_iterator, typename _Index_type, typename _Function>
class _Parallel_chunk_helper_invoke<_Random_iterator, _Index_type, _Function, false>
{
public:
    static void __cdecl _Invoke(const _Random_iterator& _First, _Index_type& _Index, const _Function& _Func)
    {
        _Func(static_cast<_Random_iterator>(_First + _Index));
    }
};

// Represents a range of iteration

template<typename _Index_type>
class _Range
{
public:

    // Construct an object for the range [_Current_iteration, _Last_iteration)
    _Range(_Index_type _Current_iteration, _Index_type _Last_iteration) :
      _M_current(_Current_iteration), _M_last(_Last_iteration)
    {
        // On creation, the range shall have at least 1 iteration.
        _CONCRT_ASSERT(_Number_of_iterations() > 0);
    }

    // Send a portion of the range to the helper
    void _Send_range(_Range<_Index_type> * _Helper_range)
    {
        // If there are no iterations left other than the current one, no help is
        // needed. The assignment below then hands the helper an empty range, a special
        // value the helper understands, and this worker continues doing the work.
        _Index_type _Remaining_iterations = _Number_of_iterations();
        if (_Remaining_iterations > 1)
        {
            // Compute the two pieces of the work range: one for the worker and one for helper class.
            _M_last_iteration = _M_current_iteration + _Remaining_iterations / 2;

            // There needs to be at least 1 iteration left because the current iteration cannot be sent.
            _CONCRT_ASSERT(_Number_of_iterations() > 0);
        }

        // This is also a signal for the helper that a range has been sent to it.
        _Helper_range->_M_current_iteration = _M_last_iteration;
    }

    // Steal the entire range and give it to the helper
    void _Steal_range(_Range<_Index_type>  * _Helper_range)
    {
        // We allow stealing only from a range that has at least 1 iteration
        _CONCRT_ASSERT(_Number_of_iterations() > 0);

        _Index_type _Current_iter = _M_current_iteration;

        _Helper_range->_M_current_iteration = _Current_iter + 1;
        _Helper_range->_M_last_iteration = _M_last_iteration;

        _M_last_iteration = _Current_iter + 1;
    }

    // Returns the number of iterations in this range
    _Index_type _Number_of_iterations() const
    {
        return (_M_last_iteration - _M_current_iteration);
    }

    // Returns the current iteration in the range
    _Index_type _Get_current_iteration() const
    {
        return _M_current;
    }

    // Sets the current iteration in the range
    void _Set_current_iteration(const _Index_type _I)
    {
        _M_current = _I;
    }

    __declspec(property(get=_Get_current_iteration, put=_Set_current_iteration)) _Index_type _M_current_iteration;

    // Returns the last iteration in the range
    _Index_type _Get_last_iteration() const
    {
        return _M_last;
    }

    // Sets the last iteration in the range
    void _Set_last_iteration(const _Index_type _I)
    {
        _M_last = _I;
    }

    __declspec(property(get=_Get_last_iteration, put=_Set_last_iteration)) _Index_type _M_last_iteration;

private:

    // These members are volatile because they are updated by the helper
    // and used by the worker.
    volatile _Index_type _M_current;
    volatile _Index_type _M_last;
};
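
// Worked example (illustrative): suppose a worker owns the range [0, 8) and sits
// on iteration 0. _Send_range keeps the first half, [0, 4), for the worker and
// hands [4, 8) to the helper. _Steal_range is more aggressive: it leaves the
// worker only its current iteration, [0, 1), and hands [1, 8) to the helper; it
// is used when the worker is synchronously blocked.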

// A proxy for the worker responsible for maintaining communication with the helper

template<typename _Index_type>
class _Worker_proxy
{
public:
    _Worker_proxy(_Worker_proxy *_PParent_worker = NULL) :
      _M_pHelper_range(NULL), _M_pParent_worker(_PParent_worker), _M_pWorker_range(NULL), _M_completion_count(0), _M_stop_iterating(0)
    {
        _M_context = Concurrency::details::_Context::_CurrentContext();
    }

    ~_Worker_proxy()
    {
        // Make the check to avoid doing extra work in the non-exceptional cases
        if (_M_completion_count != _Tree_Complete)
        {
            // On exception, notify our parent so it breaks out of its loop.
            _Propagate_cancel();

            // On exception, we need to set _M_completion_count to ensure that the helper breaks out of its spin wait.
            _Set_done();
        }
    }

    // Obtain a range from the worker
    bool _Receive_range(_Range<_Index_type> * _Helper_range)
    {
        // If the worker already finished, then there is no work left for the helper
        if (_M_completion_count)
        {
            return false;
        }

        _CONCRT_ASSERT(_Helper_range != NULL);

        // There are two special values for _M_current_iteration that are not valid: one is the
        // initial value of the working class which it will never share, and the other is
        // the last exclusive iteration of the working class, which has no work to be done.
        // We use the former value so that we can understand worker's response.
        _Index_type _Cached_first_iteration = _Helper_range->_M_current_iteration;

        // The following operation is not done via an interlocked operation because it
        // does not have to be. The helper lazily registers that it would like to help
        // the worker, allowing some time to elapse before that information makes it
        // over to the worker. The idea is not to disturb the worker unless it is
        // necessary. An interlocked operation could be added in the future if the time
        // spent in the busy-wait loop turns out to be too long.
        _CONCRT_ASSERT(_M_pHelper_range == NULL);
        _M_pHelper_range = _Helper_range;

        ::Concurrency::details::_SpinWaitBackoffNone spinWait(::Concurrency::details::_Context::_Yield);

        // If the worker is done, it will flush the store buffer and signal the helper by
        // changing _M_current_iteration in the helper's range.
        while ((_Helper_range->_M_current_iteration == _Cached_first_iteration) && !_M_completion_count)
        {
            if ((_M_pWorker_range != NULL) && _M_context._IsSynchronouslyBlocked())
            {
                // Attempt to steal the entire range from the worker if it is synchronously blocked.

                // Make sure that worker makes no forward progress while helper is attempting to
                // steal its range. If worker does get unblocked, simply back off in the helper.
                // Note that there could be another helper running if a range has already been
                // sent to us.
                long _Stop_iterating = _InterlockedIncrement(&_M_stop_iterating);
                _CONCRT_ASSERT(_Stop_iterating > 0);

                // We need to make a local copy as the pointer could be changed by the worker.
                _Range<_Index_type> * _Worker_range = _M_pWorker_range;

                // The order of comparison needs to be preserved. If the parent is blocked, then
                // it cannot send a range (because _M_stop_iterating is already set). If it sent a range
                // before being synchronously blocked, then we are no longer the helper. Refrain
                // from intrusively stealing the range.
                if ((_Worker_range != NULL) && _M_context._IsSynchronouslyBlocked()
                    && (_Helper_range->_M_current_iteration == _Cached_first_iteration) && !_M_completion_count)
                {
                    _CONCRT_ASSERT(_M_pHelper_range == _Helper_range);

                    _M_pHelper_range = NULL;
                    _Worker_range->_Steal_range(_Helper_range);

                    _CONCRT_ASSERT(_Helper_range->_M_current_iteration != _Cached_first_iteration);
                }

                // At this point, worker is either:
                //
                // a) no longer blocked so range will come to the helper naturally, or
                // b) out of iterations because helper stole all of it
                _Stop_iterating = _InterlockedDecrement(&_M_stop_iterating);
                _CONCRT_ASSERT(_Stop_iterating >= 0);
            }
            else
            {
                // If there is no work received in a full spin, then start yielding the context
                spinWait._SpinOnce();
            }
        }

        // If the helper range's current iteration still equals the cached first
        // iteration, the worker class is sending the signal that it does not need any help.
        if (_Helper_range->_M_current_iteration == _Cached_first_iteration)
        {
            return false;
        }

        return (_Helper_range->_Number_of_iterations() > 0);
    }

    // Send a portion of our range and notify the helper.
    bool _Send_range(_Range<_Index_type> * _Worker_range)
    {
        // Worker range shall not be available for stealing at this time.
        _CONCRT_ASSERT(_M_pWorker_range == NULL);

        // Helper shall be registered.
        _CONCRT_ASSERT(_M_pHelper_range != NULL);

        // Send the range
        _Worker_range->_Send_range(_M_pHelper_range);

        // Notify the helper. The fence ensures that the prior updates are visible.
        _InterlockedExchangePointer(reinterpret_cast<void * volatile *>(&_M_pHelper_range), NULL);

        // The current iteration should still be left
        _CONCRT_ASSERT(_Worker_range->_Number_of_iterations() >= 1);

        // Indicate if we need another helper
        return (_Worker_range->_Number_of_iterations() > 1);
    }

    // Let the helper know that it is ok to intrusively steal range from the worker by publishing the
    // remaining range.
    void _Enable_intrusive_steal(_Range<_Index_type> * _Worker_range)
    {
        _M_pWorker_range = _Worker_range;
    }

    // Prevent the helper from intrusively stealing range from the worker
    void _Disable_intrusive_steal()
    {
        _M_pWorker_range = NULL;
        _Wait_on_intrusive_steal();
    }

    bool _Is_helper_registered()
    {
        return (_M_pHelper_range != NULL);
    }

    bool _Is_done()
    {
        return (_M_completion_count != 0);
    }

    void _Set_done()
    {
        // Let the helper know that this class is done with work and flush the store buffer. This operation
        // ensures that any buffered store to helper range in _Send_range is flushed and
        // available in _Receive_range (so there will be no lost ranges).
        _InterlockedExchange(&_M_completion_count, 1);
    }

    void _Set_tree_done()
    {
        // Make sure that **WE** know when our destructor hits that the entire tree is complete.
        _M_completion_count = _Tree_Complete;
    }

    bool _Is_beacon_signaled()
    {
        return _M_beacon._Is_signaled();
    }

    bool _Verify_beacon_cancellation()
    {
        return _M_beacon._Confirm_cancel();
    }

private:

    // Spin wait for any intrusive steal that is in progress.
    void _Wait_on_intrusive_steal()
    {
        // This code is used to synchronize with helper in case of worker cooperative blocking.
        if (_M_stop_iterating != 0)
        {
            ::Concurrency::details::_SpinWaitBackoffNone spinWait;

            while (_M_stop_iterating != 0)
            {
                spinWait._SpinOnce();
            }
        }
    }

    void _NotifyCancel()
    {
        _M_beacon._Raise();
    }

    void _Propagate_cancel()
    {
        if (_M_pParent_worker != NULL)
        {
            _M_pParent_worker->_NotifyCancel();
        }
    }

    // Constant indicating sub-tree completion
    static const long _Tree_Complete = 2;

    // Read in the loop
    _Range<_Index_type> * volatile               _M_pHelper_range;

    // Read at the end of the loop
    _Worker_proxy *                              _M_pParent_worker;

    // Written rarely
    ::Concurrency::details::_Cancellation_beacon _M_beacon;
    ::Concurrency::details::_Context             _M_context;

    volatile long                                _M_completion_count;

    // Written to in the loop
    _Range<_Index_type> * volatile               _M_pWorker_range;
    volatile long                                _M_stop_iterating;

    _Worker_proxy const & operator=(_Worker_proxy const&);    // no assignment operator

};
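
// Protocol sketch (an illustrative summary of the worker/helper handshake above):
//
//   helper:  _Receive_range(&_Helper_range)  // publishes _M_pHelper_range, spins
//   worker:  each iteration checks _Is_helper_registered(); if so, _Send_range
//            halves the worker's range and clears _M_pHelper_range with an
//            interlocked exchange, whose fence makes the new range bounds visible
//            to the helper's spin loop
//   worker:  _Set_done() once out of iterations, releasing any spinning helper
//
// If the worker blocks synchronously, the helper may instead intrusively
// _Steal_range the published _M_pWorker_range, guarded by the _M_stop_iterating
// interlocked count so the worker cannot advance mid-steal.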

// parallel_for -- Performs parallel iteration over a range of indices from _First to _Last,
// excluding _Last. The order in which each iteration is executed is unspecified and non-deterministic.

// Closure (binding) classes for invoking parallel_for and parallel_for_each, with chunks

// A dynamically rebalancing closure class used for packaging parallel_for or parallel_for_each for invocation in chunks.
// If some tasks finish earlier than others, helper tasks get executed which ensures further distribution of work.

template <typename _Random_iterator, typename _Index_type, typename _Function, typename _Partitioner, bool _Is_iterator>
class _Parallel_chunk_helper
{
public:
    _Parallel_chunk_helper(_Index_type, const _Random_iterator& _First, _Index_type _First_iteration, _Index_type _Last_iteration, const _Index_type& _Step,
        const _Function& _Func, const _Partitioner&, _Worker_proxy<_Index_type> * const _Parent_data = NULL) :
        _M_first(_First), _M_first_iteration(_First_iteration), _M_last_iteration(_Last_iteration), _M_step(_Step), _M_function(_Func),
        _M_parent_worker(_Parent_data)
    {
        // Empty constructor because members are already assigned
    }

    // Constructor overload that accepts a range
    _Parallel_chunk_helper(const _Random_iterator& _First,  const _Index_type& _Step, const _Function& _Func,
        const _Range<_Index_type>& _Worker_range, _Worker_proxy<_Index_type> * const _Parent_data = NULL) :
        _M_first(_First), _M_first_iteration(_Worker_range._M_current_iteration), _M_last_iteration(_Worker_range._M_last_iteration), _M_step(_Step), _M_function(_Func),
        _M_parent_worker(_Parent_data)
    {
        // Empty constructor because members are already assigned
    }

    // The main helper function, which iterates over the given collection and invokes the user function on every iteration.
    // Function is marked as const even though it does mutate some of its members (those are declared as mutable). This is done
    // in order to easily communicate between a worker and a helper instance, without holding references to many local variables.
    // However, this function does not mutate any state that is visible to anyone outside of this class, nor would that be
    // possible due to the implicit copy of the functor that happens when a new task_handle is created.
    void operator()() const
    {
        _Range<_Index_type> _Worker_range(_M_first_iteration, _M_last_iteration);

        // This class has two modes: worker and helper. The originally split chunk is always a
        // worker, while any subsequent class spawned from this class is in the helper
        // mode, which is signified using a link to the worker class through _M_pOwning_worker
        // handle. So, it will wait for work to be dished out by the working class while in helper mode.
        if (_M_parent_worker != NULL && !_M_parent_worker->_Receive_range(&_Worker_range))
        {
            // If the worker class rejected the help, simply return
            return;
        }

        // Keep the secondary, scaled, loop index for quick indexing into the data structure
        _Index_type _Current_iteration = _Worker_range._M_current_iteration;
        _Index_type _Scaled_index = _Current_iteration * _M_step;

        // If there is only one iteration to be executed there is no need to initialize any
        // helper classes (work is indivisible).
        if (_Worker_range._Number_of_iterations() == 1)
        {
            // Execute one iteration
            _Parallel_chunk_helper_invoke<_Random_iterator, _Index_type, _Function, _Is_iterator>::_Invoke(_M_first, _Scaled_index, _M_function);
            return;
        }

        // If the execution reaches this point it means that this class now has a chunk of work
        // that it needs to get done, so it has transitioned into the worker mode.
        structured_task_group _Helper_group;

        // Initialize fields that are needed in the helper
        _Worker_proxy<_Index_type> _Worker(_M_parent_worker);

        // Instantiate a helper class for this working class and put it on the work queue.
        // If some thread is idle it will be able to steal the helper and help this class
        // finish its work by stealing a piece of the work range.
        task_handle<_Parallel_chunk_helper> _Helper_task(_Parallel_chunk_helper(_M_first, _M_step, _M_function, _Worker_range, &_Worker));

        _Helper_group.run(_Helper_task);

        ::Concurrency::details::_MallocaListHolder<task_handle<_Parallel_chunk_helper>> _Holder;

        // Normally, for a cancellation semantic in cooperation with the helper, we would run_and_wait the below code on the Helper_group. Unfortunately,
        // the capture by reference of things which must be shared (_Worker, and so forth) will cause the loop below to add additional indirection
        // instructions. The loop below *MUST* be as tight as possible with the defined semantics. Instead, we will manually notify our parent if the
        // worker's destructor runs without hitting the bottom of our chunk. This is done through notification on the beacon.

        for (_Index_type _I = _Current_iteration; _I < _Worker_range._M_last_iteration; (_I++, _Worker_range._M_current_iteration = _I, _Scaled_index += _M_step))
        {
            if (_Worker._Is_beacon_signaled())
            {
                // Either a parent task group is canceled or one of the other iterations
                // threw an exception. Abort the remaining iterations
                //
                // Note that this could be a false positive that we must verify.
                if (_Worker._Is_done() || _Worker._Verify_beacon_cancellation())
                {
                    break;
                }
            }

            if (_Worker._Is_helper_registered())
            {
                // The helper class (there can only be one) registered to help this class with the work.
                // Thus, figure out if this class needs help and split the range among the two classes.

                if (_Worker._Send_range(&_Worker_range))
                {
                    // Construct every new instance of a helper class on the stack because it is beneficial to use
                    // a structured task group where the class itself is responsible for task handle's lifetime.
                    task_handle<_Parallel_chunk_helper> * _Helper_subtask = _Holder._AddRawMallocaNode(_malloca(_Holder._GetAllocationSize()));

                    new(_Helper_subtask) task_handle<_Parallel_chunk_helper>
                        (_Parallel_chunk_helper(_M_first, _M_step, _M_function, _Worker_range, &_Worker));

                    // If _Send_range returns true, that means that there is still some non-trivial
                    // work to be done, so this class will potentially need another helper.
                    _Helper_group.run(*_Helper_subtask);
                }
            }

            // Allow intrusive stealing by the helper
            _Worker._Enable_intrusive_steal(&_Worker_range);

            // Execute one iteration: the element is at scaled index away from the first element.
            _Parallel_chunk_helper_invoke<_Random_iterator, _Index_type, _Function, _Is_iterator>::_Invoke(_M_first, _Scaled_index, _M_function);

            // Helper shall not steal a range after this call
            _Worker._Disable_intrusive_steal();
        }

        // Indicate that the worker is done with its iterations.
        _Worker._Set_done();

        // Wait for all worker/helper iterations to finish
        _Helper_group.wait();

        // Make sure that we've signaled that the tree is complete. This is used to detect any exception out of either _Parallel_chunk_helper_invoke or
        // _Helper_group.wait() above as a cancellation of the loop which must propagate upwards because we do not wrap the loop body in run_and_wait.
        _Worker._Set_tree_done();
    }

private:

    const _Random_iterator&            _M_first;
    const _Index_type&                 _M_step;
    const _Function&                   _M_function;

    const _Index_type                  _M_first_iteration;
    const _Index_type                  _M_last_iteration;

    _Worker_proxy<_Index_type> * const _M_parent_worker;

    _Parallel_chunk_helper const & operator=(_Parallel_chunk_helper const&);    // no assignment operator
};

template <typename _Random_iterator, typename _Index_type, typename _Function, typename _Partitioner, bool _Is_iterator>
class _Parallel_fixed_chunk_helper
{
public:
    _Parallel_fixed_chunk_helper(_Index_type, const _Random_iterator& _First, _Index_type _First_iteration,
         _Index_type _Last_iteration, const _Index_type& _Step, const _Function& _Func, const _Partitioner&) :
        _M_first(_First), _M_first_iteration(_First_iteration), _M_last_iteration(_Last_iteration), _M_step(_Step), _M_function(_Func)
    {
        // Empty constructor because members are already assigned
    }

    void operator()() const
    {
        // Keep the secondary, scaled, loop index for quick indexing into the data structure
        _Index_type _Scaled_index = _M_first_iteration * _M_step;

        for (_Index_type _I = _M_first_iteration; _I < _M_last_iteration; (_I++, _Scaled_index += _M_step))
        {
            // Execute one iteration: the element is at scaled index away from the first element.
            _Parallel_chunk_helper_invoke<_Random_iterator, _Index_type, _Function, _Is_iterator>::_Invoke(_M_first, _Scaled_index, _M_function);
        }
    }
private:

    const _Random_iterator&            _M_first;
    const _Index_type&                 _M_step;
    const _Function&                   _M_function;

    const _Index_type                  _M_first_iteration;
    const _Index_type                  _M_last_iteration;

    _Parallel_fixed_chunk_helper const & operator=(_Parallel_fixed_chunk_helper const&);    // no assignment operator
};

template <typename _Random_iterator, typename _Index_type, typename _Function, bool _Is_iterator>
class _Parallel_localized_chunk_helper
{
public:
    typedef _Parallel_fixed_chunk_helper<_Random_iterator, _Index_type, _Function, static_partitioner, _Is_iterator> _Base;

    _Parallel_localized_chunk_helper(_Index_type _Chunk_index, const _Random_iterator& _First, _Index_type _First_iteration, _Index_type _Last_iteration, const _Index_type& _Step,
        const _Function& _Func, affinity_partitioner& _Part) :
        _M_fixed_helper(_Chunk_index, _First, _First_iteration, _Last_iteration, _Step, _Func, static_partitioner()),
        _M_chunk_location(_Part._Get_chunk_location(static_cast<unsigned int>(_Chunk_index)))
    {
        // Empty constructor because members are already assigned
    }

    // Override the operator() in the base class. Note that this is not a virtual override.
    void operator()() const
    {
        // Check here if location needs to be saved.
        if (_M_chunk_location._Is_system())
        {
            _M_chunk_location = location::current();
        }

        _M_fixed_helper();
    }
private:

    location&       _M_chunk_location;
    _Base           _M_fixed_helper;
    _Parallel_localized_chunk_helper const & operator=(_Parallel_localized_chunk_helper const&);    // no assignment operator
};

#pragma warning(pop)

template <typename _Worker_class, typename _Index_type, typename _Partitioner>
void _Parallel_chunk_task_group_run(structured_task_group& _Task_group,
                                    task_handle<_Worker_class>* _Chunk_helpers,
                                    const _Partitioner&,
                                    _Index_type _I)
{
    _Task_group.run(_Chunk_helpers[_I]);
}

template <typename _Worker_class, typename _Index_type>
void _Parallel_chunk_task_group_run(structured_task_group& _Task_group,
                                    task_handle<_Worker_class>* _Chunk_helpers,
                                    affinity_partitioner& _Part,
                                    _Index_type _I)
{
     _Task_group.run(_Chunk_helpers[_I], _Part._Get_chunk_location(static_cast<unsigned int>(_I)));
}


// Helper functions that implement parallel_for

template <typename _Worker_class, typename _Random_iterator, typename _Index_type, typename _Function, typename _Partitioner>
void _Parallel_chunk_impl(const _Random_iterator& _First, _Index_type _Range_arg, const _Index_type& _Step, const _Function& _Func, _Partitioner&& _Part)
{
    _CONCRT_ASSERT(_Range_arg > 1);
    _CONCRT_ASSERT(_Step > 0);

    _Index_type _Num_iterations = (_Step == 1) ? _Range_arg : (((_Range_arg - 1) / _Step) + 1);
    _CONCRT_ASSERT(_Num_iterations > 1);

    _Index_type _Num_chunks = _Part._Get_num_chunks(_Num_iterations);
    _CONCRT_ASSERT(_Num_chunks > 0);

    // Allocate memory for the task handles on the stack; the handles must remain alive until the structured_task_group has finished waiting.
    ::Concurrency::details::_MallocaArrayHolder<task_handle<_Worker_class>> _Holder;
    task_handle<_Worker_class> * _Chunk_helpers = _Holder._InitOnRawMalloca(_malloca(sizeof(task_handle<_Worker_class>) * static_cast<size_t>(_Num_chunks)));

    structured_task_group _Task_group;

    _Index_type _Iterations_per_chunk = _Num_iterations / _Num_chunks;
    _Index_type _Remaining_iterations = _Num_iterations % _Num_chunks;

    // If there are fewer iterations than desired chunks, set the number of chunks
    // to be the number of iterations.
    if (_Iterations_per_chunk == 0)
    {
        _Num_chunks = _Remaining_iterations;
    }

    _Index_type _Work_size = 0;
    _Index_type _Start_iteration = 0;
    _Index_type _I;

    // Split the available work in chunks
    for (_I = 0; _I < _Num_chunks - 1; _I++)
    {
        if (_Remaining_iterations > 0)
        {
            // Iterations are not divided evenly, so add 1 remainder iteration each time
            _Work_size = _Iterations_per_chunk + 1;
            _Remaining_iterations--;
        }
        else
        {
            _Work_size = _Iterations_per_chunk;
        }

        // New up a task_handle "in-place", in the array preallocated on the stack
        new(&_Chunk_helpers[_I]) task_handle<_Worker_class>(_Worker_class(_I, _First, _Start_iteration, _Start_iteration + _Work_size, _Step, _Func, std::forward<_Partitioner>(_Part)));
        _Holder._IncrementConstructedElemsCount();

        // Run each of the chunk tasks in parallel
        _Parallel_chunk_task_group_run(_Task_group, _Chunk_helpers, std::forward<_Partitioner>(_Part), _I);

        // Prepare for the next iteration
        _Start_iteration += _Work_size;
    }

    // Because this is the last chunk, the work size might be different
    _CONCRT_ASSERT((_Remaining_iterations == 0) || ((_Iterations_per_chunk == 0) && (_Remaining_iterations == 1)));
    _Work_size = _Num_iterations - _Start_iteration;

    // New up a task_handle "in-place", in the array preallocated on the stack
    new(&_Chunk_helpers[_I]) task_handle<_Worker_class>(_Worker_class(_I, _First, _Start_iteration, _Start_iteration + _Work_size, _Step, _Func, std::forward<_Partitioner>(_Part)));
    _Holder._IncrementConstructedElemsCount();

    _Task_group.run_and_wait(_Chunk_helpers[_I]);
}
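
// Worked example of the split above (illustrative, using hypothetical values): with
// _Num_iterations == 10 and _Num_chunks == 4, _Iterations_per_chunk == 2 and
// _Remaining_iterations == 2, so the first two chunks each absorb one extra iteration
// and the chunk sizes come out as 3, 3, 2, 2.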

template <typename _Worker_class, typename _Random_iterator, typename _Index_type, typename _Function>
void _Parallel_chunk_impl(const _Random_iterator& _First, _Index_type _Range_arg, const _Index_type& _Step, const _Function& _Func)
{
    _Parallel_chunk_impl<_Worker_class>(_First, _Range_arg, _Step, _Func, auto_partitioner());
}

// Helper for the parallel_for API with the default dynamic partitioner, which implements range-stealing to balance load.
template <typename _Index_type, typename _Diff_type, typename _Function>
void _Parallel_for_partitioned_impl(_Index_type _First, _Diff_type _Range_arg, _Diff_type _Step, const _Function& _Func, const auto_partitioner& _Part)
{
    typedef _Parallel_chunk_helper<_Index_type, _Diff_type, _Function, auto_partitioner, false> _Worker_class;
    _Parallel_chunk_impl<_Worker_class>(_First, _Range_arg, _Step, _Func, _Part);
}

// Helper for the parallel_for API with a static partitioner - creates a fixed number of chunks up front with no range-stealing enabled.
template <typename _Index_type, typename _Diff_type, typename _Function>
void _Parallel_for_partitioned_impl(_Index_type _First, _Diff_type _Range_arg, _Diff_type _Step, const _Function& _Func, const static_partitioner& _Part)
{
    typedef _Parallel_fixed_chunk_helper<_Index_type, _Diff_type, _Function, static_partitioner, false> _Worker_class;
    _Parallel_chunk_impl<_Worker_class>(_First, _Range_arg, _Step, _Func, _Part);
}

// Helper for the parallel_for API with a simple partitioner - creates a fixed number of chunks up front with no range-stealing enabled.
template <typename _Index_type, typename _Diff_type, typename _Function>
void _Parallel_for_partitioned_impl(_Index_type _First, _Diff_type _Range_arg, _Diff_type _Step, const _Function& _Func, const simple_partitioner& _Part)
{
    typedef _Parallel_fixed_chunk_helper<_Index_type, _Diff_type, _Function, simple_partitioner, false> _Worker_class;
    _Parallel_chunk_impl<_Worker_class>(_First, _Range_arg, _Step, _Func, _Part);
}

// Helper for the parallel_for API with an affinity partitioner - creates a fixed number of chunks up front with no range-stealing enabled. Subsequent
// calls to parallel_for with the same affinity partitioner (passed in as a non-const reference) are scheduled to the same locations they previously ran on.
template <typename _Index_type, typename _Diff_type, typename _Function>
void _Parallel_for_partitioned_impl(_Index_type _First, _Diff_type _Range_arg, _Diff_type _Step, const _Function& _Func, affinity_partitioner& _Part)
{
    typedef _Parallel_localized_chunk_helper<_Index_type, _Diff_type, _Function, false> _Worker_class;
    _Parallel_chunk_impl<_Worker_class>(_First, _Range_arg, _Step, _Func, _Part);
}

template <typename _Index_type, typename _Function, typename _Partitioner>
void _Parallel_for_impl(_Index_type _First, _Index_type _Last, _Index_type _Step, const _Function& _Func, _Partitioner&& _Part)
{
    // The step argument must be 1 or greater; otherwise it is an invalid argument
    if (_Step < 1)
    {
        throw std::invalid_argument("_Step");
    }

    // If there are no elements in this range we just return
    if (_First >= _Last)
    {
        return;
    }

    // Compute the difference type based on the arguments and avoid signed overflow for int, long, and long long
    typedef typename std::tr1::conditional<std::tr1::is_same<_Index_type, int>::value, unsigned int,
        typename std::tr1::conditional<std::tr1::is_same<_Index_type, long>::value, unsigned long,
            typename std::tr1::conditional<std::tr1::is_same<_Index_type, long long>::value, unsigned long long, decltype(_Last - _First)
            >::type
        >::type
    >::type _Diff_type;
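
    // For example (illustrative): with _Index_type == int, _First == -2, and _Last == INT_MAX,
    // the subtraction _Last - _First would overflow int, but the range size (INT_MAX + 2) still
    // fits in the selected _Diff_type, unsigned int.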

    _Diff_type _Range_size = _Diff_type(_Last) - _Diff_type(_First);
    _Diff_type _Diff_step = _Step;

    if (_Range_size <= _Diff_step)
    {
        _Func(_First);
    }
    else
    {
        _Parallel_for_partitioned_impl<_Index_type, _Diff_type, _Function>(_First, _Range_size, _Step, _Func, std::forward<_Partitioner>(_Part));
    }
}

template <typename _Index_type, typename _Function>
void _Parallel_for_impl(_Index_type _First, _Index_type _Last, _Index_type _Step, const _Function& _Func)
{
    _Parallel_for_impl(_First, _Last, _Step, _Func, auto_partitioner());
}

/// <summary>
///     <c>parallel_for</c> iterates over a range of indices and executes a user-supplied function at each iteration, in parallel.
/// </summary>
/// <typeparam name="_Index_type">
///     The type of the index being used for the iteration.
/// </typeparam>
/// <typeparam name="_Function">
///     The type of the function that will be executed at each iteration.
/// </typeparam>
/// <typeparam name="_Partitioner">
///     The type of the partitioner that is used to partition the supplied range.
/// </typeparam>
/// <param name="_First">
///     The first index to be included in the iteration.
/// </param>
/// <param name="_Last">
///     The index one past the last index to be included in the iteration.
/// </param>
/// <param name="_Step">
///     The value by which to step when iterating from <paramref name="_First"/> to <paramref name="_Last"/>. The step must be positive.
///     <see cref="invalid_argument Class">invalid_argument</see> is thrown if the step is less than 1.
/// </param>
/// <param name="_Func">
///     The function to be executed at each iteration. This may be a lambda expression, a function pointer, or any object
///     that supports a version of the function call operator with the signature
///     <c>void operator()(</c><typeparamref name="_Index_type"/><c>)</c>.
/// </param>
/// <param name="_Part">
///     A reference to the partitioner object. The argument can be one of
///     <c>const</c> <see cref="auto_partitioner Class">auto_partitioner</see><c>&amp;</c>,
///     <c>const</c> <see cref="static_partitioner Class">static_partitioner</see><c>&amp;</c>,
///     <c>const</c> <see cref="simple_partitioner Class">simple_partitioner</see><c>&amp;</c> or
///     <see cref="affinity_partitioner Class">affinity_partitioner</see><c>&amp;</c>.
///     If an <see cref="affinity_partitioner Class">affinity_partitioner</see> object is used, the reference must be a non-const l-value reference,
///     so that the algorithm can store state for future loops to re-use.
/// </param>
/// <remarks>
///     For more information, see <see cref="Parallel Algorithms"/>.
/// </remarks>
/**/
template <typename _Index_type, typename _Function, typename _Partitioner>
void parallel_for(_Index_type _First, _Index_type _Last, _Index_type _Step, const _Function& _Func, _Partitioner&& _Part)
{
    _Trace_ppl_function(PPLParallelForEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_START);
    _Parallel_for_impl(_First, _Last, _Step, _Func, std::forward<_Partitioner>(_Part));
    _Trace_ppl_function(PPLParallelForEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_END);
}
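
// Example (illustrative sketch, not part of this header): calling the overload above with an
// explicit step and a static_partitioner. Assumes <vector> is included; the vector 'v' is
// hypothetical.
//
//     std::vector<int> v(100, 0);
//     Concurrency::parallel_for(0, 100, 2, [&v](int i) {
//         v[i] = 1;    // marks even indices only
//     }, Concurrency::static_partitioner());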

/// <summary>
///     <c>parallel_for</c> iterates over a range of indices and executes a user-supplied function at each iteration, in parallel.
/// </summary>
/// <typeparam name="_Index_type">
///     The type of the index being used for the iteration. <paramref name="_Index_type"/> must be an integral type.
/// </typeparam>
/// <typeparam name="_Function">
///     The type of the function that will be executed at each iteration.
/// </typeparam>
/// <param name="_First">
///     The first index to be included in the iteration.
/// </param>
/// <param name="_Last">
///     The index one past the last index to be included in the iteration.
/// </param>
/// <param name="_Step">
///     The value by which to step when iterating from <paramref name="_First"/> to <paramref name="_Last"/>. The step must be positive.
///     <see cref="invalid_argument Class">invalid_argument</see> is thrown if the step is less than 1.
/// </param>
/// <param name="_Func">
///     The function to be executed at each iteration. This may be a lambda expression, a function pointer, or any object
///     that supports a version of the function call operator with the signature
///     <c>void operator()(</c><typeparamref name="_Index_type"/><c>)</c>.
/// </param>
/// <remarks>
///     For more information, see <see cref="Parallel Algorithms"/>.
/// </remarks>
/**/
template <typename _Index_type, typename _Function>
void parallel_for(_Index_type _First, _Index_type _Last, _Index_type _Step, const _Function& _Func)
{
    parallel_for(_First, _Last, _Step, _Func, auto_partitioner());
}
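
// Example (illustrative sketch, not part of this header): the overload above with the default
// auto_partitioner, stepping by 25 over [0, 100). The array 'v' is hypothetical.
//
//     int v[4] = { 0, 0, 0, 0 };
//     Concurrency::parallel_for(0, 100, 25, [&v](int i) {
//         v[i / 25] = i;    // i takes the values 0, 25, 50, and 75
//     });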

/// <summary>
///     <c>parallel_for</c> iterates over a range of indices and executes a user-supplied function at each iteration, in parallel.
/// </summary>
/// <typeparam name="_Index_type">
///     The type of the index being used for the iteration.
/// </typeparam>
/// <typeparam name="_Function">
///     The type of the function that will be executed at each iteration.
/// </typeparam>
/// <param name="_First">
///     The first index to be included in the iteration.
/// </param>
/// <param name="_Last">
///     The index one past the last index to be included in the iteration.
/// </param>
/// <param name="_Func">
///     The function to be executed at each iteration. This may be a lambda expression, a function pointer, or any object
///     that supports a version of the function call operator with the signature
///     <c>void operator()(</c><typeparamref name="_Index_type"/><c>)</c>.
/// </param>
/// <param name="_Part">
///     A reference to the partitioner object. The argument can be one of
///     <c>const</c> <see cref="auto_partitioner Class">auto_partitioner</see><c>&amp;</c>,
///     <c>const</c> <see cref="static_partitioner Class">static_partitioner</see><c>&amp;</c>,
///     <c>const</c> <see cref="simple_partitioner Class">simple_partitioner</see><c>&amp;</c> or
///     <see cref="affinity_partitioner Class">affinity_partitioner</see><c>&amp;</c>.
///     If an <see cref="affinity_partitioner Class">affinity_partitioner</see> object is used, the reference must be a non-const l-value reference,
///     so that the algorithm can store state for future loops to re-use.
/// </param>
/// <remarks>
///     For more information, see <see cref="Parallel Algorithms"/>.
/// </remarks>
/**/
template <typename _Index_type, typename _Function>
void parallel_for(_Index_type _First, _Index_type _Last, const _Function& _Func, const auto_partitioner& _Part = auto_partitioner())
{
    parallel_for(_First, _Last, _Index_type(1), _Func, _Part);
}

/// <summary>
///     <c>parallel_for</c> iterates over a range of indices and executes a user-supplied function at each iteration, in parallel.
/// </summary>
/// <typeparam name="_Index_type">
///     The type of the index being used for the iteration.
/// </typeparam>
/// <typeparam name="_Function">
///     The type of the function that will be executed at each iteration.
/// </typeparam>
/// <param name="_First">
///     The first index to be included in the iteration.
/// </param>
/// <param name="_Last">
///     The index one past the last index to be included in the iteration.
/// </param>
/// <param name="_Func">
///     The function to be executed at each iteration. This may be a lambda expression, a function pointer, or any object
///     that supports a version of the function call operator with the signature
///     <c>void operator()(</c><typeparamref name="_Index_type"/><c>)</c>.
/// </param>
/// <param name="_Part">
///     A reference to the partitioner object. The argument can be one of
///     <c>const</c> <see cref="auto_partitioner Class">auto_partitioner</see><c>&amp;</c>,
///     <c>const</c> <see cref="static_partitioner Class">static_partitioner</see><c>&amp;</c>,
///     <c>const</c> <see cref="simple_partitioner Class">simple_partitioner</see><c>&amp;</c> or
///     <see cref="affinity_partitioner Class">affinity_partitioner</see><c>&amp;</c>.
///     If an <see cref="affinity_partitioner Class">affinity_partitioner</see> object is used, the reference must be a non-const l-value reference,
///     so that the algorithm can store state for future loops to re-use.
/// </param>
/// <remarks>
///     For more information, see <see cref="Parallel Algorithms"/>.
/// </remarks>
/**/
template <typename _Index_type, typename _Function>
void parallel_for(_Index_type _First, _Index_type _Last, const _Function& _Func, const static_partitioner& _Part)
{
    parallel_for(_First, _Last, _Index_type(1), _Func, _Part);
}

/// <summary>
///     <c>parallel_for</c> iterates over a range of indices and executes a user-supplied function at each iteration, in parallel.
/// </summary>
/// <typeparam name="_Index_type">
///     The type of the index being used for the iteration.
/// </typeparam>
/// <typeparam name="_Function">
///     The type of the function that will be executed at each iteration.
/// </typeparam>
/// <param name="_First">
///     The first index to be included in the iteration.
/// </param>
/// <param name="_Last">
///     The index one past the last index to be included in the iteration.
/// </param>
/// <param name="_Func">
///     The function to be executed at each iteration. This may be a lambda expression, a function pointer, or any object
///     that supports a version of the function call operator with the signature
///     <c>void operator()(</c><typeparamref name="_Index_type"/><c>)</c>.
/// </param>
/// <param name="_Part">
///     A reference to the partitioner object. The argument can be one of
///     <c>const</c> <see cref="auto_partitioner Class">auto_partitioner</see><c>&amp;</c>,
///     <c>const</c> <see cref="static_partitioner Class">static_partitioner</see><c>&amp;</c>,
///     <c>const</c> <see cref="simple_partitioner Class">simple_partitioner</see><c>&amp;</c> or
///     <see cref="affinity_partitioner Class">affinity_partitioner</see><c>&amp;</c>.
///     If an <see cref="affinity_partitioner Class">affinity_partitioner</see> object is used, the reference must be a non-const l-value reference,
///     so that the algorithm can store state for future loops to re-use.
/// </param>
/// <remarks>
///     For more information, see <see cref="Parallel Algorithms"/>.
/// </remarks>
/**/
template <typename _Index_type, typename _Function>
void parallel_for(_Index_type _First, _Index_type _Last, const _Function& _Func, const simple_partitioner& _Part)
{
    parallel_for(_First, _Last, _Index_type(1), _Func, _Part);
}

/// <summary>
///     <c>parallel_for</c> iterates over a range of indices and executes a user-supplied function at each iteration, in parallel.
/// </summary>
/// <typeparam name="_Index_type">
///     The type of the index being used for the iteration.
/// </typeparam>
/// <typeparam name="_Function">
///     The type of the function that will be executed at each iteration.
/// </typeparam>
/// <param name="_First">
///     The first index to be included in the iteration.
/// </param>
/// <param name="_Last">
///     The index one past the last index to be included in the iteration.
/// </param>
/// <param name="_Func">
///     The function to be executed at each iteration. This may be a lambda expression, a function pointer, or any object
///     that supports a version of the function call operator with the signature
///     <c>void operator()(</c><typeparamref name="_Index_type"/><c>)</c>.
/// </param>
/// <param name="_Part">
///     A reference to the partitioner object. The argument can be one of
///     <c>const</c> <see cref="auto_partitioner Class">auto_partitioner</see><c>&amp;</c>,
///     <c>const</c> <see cref="static_partitioner Class">static_partitioner</see><c>&amp;</c>,
///     <c>const</c> <see cref="simple_partitioner Class">simple_partitioner</see><c>&amp;</c> or
///     <see cref="affinity_partitioner Class">affinity_partitioner</see><c>&amp;</c>.
///     If an <see cref="affinity_partitioner Class">affinity_partitioner</see> object is used, the reference must be a non-const l-value reference,
///     so that the algorithm can store state for future loops to re-use.
/// </param>
/// <remarks>
///     For more information, see <see cref="Parallel Algorithms"/>.
/// </remarks>
/**/
template <typename _Index_type, typename _Function>
void parallel_for(_Index_type _First, _Index_type _Last, const _Function& _Func, affinity_partitioner& _Part)
{
    parallel_for(_First, _Last, _Index_type(1), _Func, _Part);
}
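
// Example (illustrative sketch, not part of this header): re-using one affinity_partitioner
// across repeated loops over the same data, so that later passes run their chunks on the same
// locations as earlier ones. Assumes <vector> is included; 'data' and 'ap' are hypothetical.
//
//     std::vector<double> data(10000, 1.0);
//     Concurrency::affinity_partitioner ap;
//     for (int pass = 0; pass < 8; ++pass)
//     {
//         Concurrency::parallel_for(0, 10000, [&data](int i) {
//             data[i] *= 0.5;
//         }, ap);    // non-const l-value reference, so the partitioner can retain its state
//     }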

// parallel_for_each -- This function will iterate over all elements in the iterator's range.

// Closure (binding) classes for invoking parallel_for_each recursively

// A closure class used for packaging a chunk of elements in parallel_for_each for parallel invocation

// Forward iterator for_each using unstructured task group

// Disable C4180: qualifier applied to function type has no meaning; ignored
// Warning fires for passing Foo function pointer to parallel_for instead of &Foo.
#pragma warning(push)
#pragma warning(disable: 4180)

template <typename _Forward_iterator, typename _Function, unsigned int _Chunk_size>
class _Parallel_for_each_helper
{
public:
    typedef typename std::iterator_traits<_Forward_iterator>::value_type _Value_type;
    static const unsigned int _Size = _Chunk_size;

    _Parallel_for_each_helper(_Forward_iterator& _First, const _Forward_iterator& _Last, const _Function& _Func) :
        _M_function(_Func), _M_len(0)
    {
        static_assert(std::is_lvalue_reference<decltype(*_First)>::value, "lvalue required for forward iterator operator *");
        // Add a batch of work items to this functor's array
        for (unsigned int _Index=0; (_Index < _Size) && (_First != _Last); _Index++)
        {
            _M_element[_M_len++] = &(*_First++);
        }
    }

    void operator()() const
    {
        // Invoke parallel_for on the batched up array of elements
        _Parallel_for_impl(0U, _M_len, 1U,
            [this] (unsigned int _Index)
            {
                _M_function(*(_M_element[_Index]));
            }
        );
    }

private:

    const _Function& _M_function;
    typename std::iterator_traits<_Forward_iterator>::pointer    _M_element[_Size];
    unsigned int     _M_len;

    _Parallel_for_each_helper const & operator=(_Parallel_for_each_helper const&);    // no assignment operator
};

#pragma warning(pop)

// Helper functions that implement parallel_for_each

template <typename _Forward_iterator, typename _Function>
void _Parallel_for_each_chunk(_Forward_iterator& _First, const _Forward_iterator& _Last, const _Function& _Func, task_group& _Task_group)
{
    // The chunk size selection depends more on the internal implementation of parallel_for than
    // on the actual input. Also, it does not have to be dynamically computed, but it helps
    // parallel_for if it is a power of 2 (easy to divide).
    const unsigned int _Chunk_size = 1024;

    // This functor will be copied onto the heap and will execute the chunk in parallel
    _Parallel_for_each_helper<_Forward_iterator, _Function, _Chunk_size> _Functor(_First, _Last, _Func);

    // Because this is an unstructured task group, running the task will make a copy of the necessary data
    // on the heap, ensuring that it is available at the time of execution.
    _Task_group.run(_Functor);
}

template <typename _Forward_iterator, typename _Function>
void _Parallel_for_each_forward_impl(_Forward_iterator& _First, const _Forward_iterator& _Last, const _Function& _Func, task_group& _Task_group)
{
    _Parallel_for_each_chunk(_First, _Last, _Func, _Task_group);

    // If there is a tail, push the tail
    if (_First != _Last)
    {
        _Task_group.run(
            [&_First, &_Last, &_Func, &_Task_group]
            {
                Concurrency::_Parallel_for_each_forward_impl(_First, _Last, _Func, _Task_group);
            }
        );
    }
}

template <typename _Forward_iterator, typename _Function>
void _Parallel_for_each_impl(_Forward_iterator _First, const _Forward_iterator& _Last, const _Function& _Func, const auto_partitioner&, std::forward_iterator_tag)
{
    // Because this is a forward iterator, it is difficult to validate that _First comes before _Last, so
    // it is up to the user to provide a valid range.
    if (_First != _Last)
    {
        task_group _Task_group;

        _Parallel_for_each_forward_impl(_First, _Last, _Func, _Task_group);

        _Task_group.wait();
    }
}

template <typename _Random_iterator, typename _Index_type, typename _Function>
void _Parallel_for_each_partitioned_impl(const _Random_iterator& _First, _Index_type _Range_arg, _Index_type _Step, const _Function& _Func, const auto_partitioner& _Part)
{
    typedef _Parallel_chunk_helper<_Random_iterator, _Index_type, _Function, auto_partitioner, true> _Worker_class;
    // Use the same function that schedules work for parallel for
    _Parallel_chunk_impl<_Worker_class>(_First, _Range_arg, _Step, _Func, _Part);
}

template <typename _Random_iterator, typename _Index_type, typename _Function>
void _Parallel_for_each_partitioned_impl(const _Random_iterator& _First, _Index_type _Range_arg, _Index_type _Step, const _Function& _Func, const static_partitioner& _Part)
{
    typedef _Parallel_fixed_chunk_helper<_Random_iterator, _Index_type, _Function, static_partitioner, true> _Worker_class;
    // Use the same function that schedules work for parallel for
    _Parallel_chunk_impl<_Worker_class>(_First, _Range_arg, _Step, _Func, _Part);
}

template <typename _Random_iterator, typename _Index_type, typename _Function>
void _Parallel_for_each_partitioned_impl(const _Random_iterator& _First, _Index_type _Range_arg, _Index_type _Step, const _Function& _Func, const simple_partitioner& _Part)
{
    typedef _Parallel_fixed_chunk_helper<_Random_iterator, _Index_type, _Function, simple_partitioner, true> _Worker_class;
    // Use the same function that schedules work for parallel for
    _Parallel_chunk_impl<_Worker_class>(_First, _Range_arg, _Step, _Func, _Part);
}

template <typename _Random_iterator, typename _Index_type, typename _Function>
void _Parallel_for_each_partitioned_impl(const _Random_iterator& _First, _Index_type _Range_arg, _Index_type _Step, const _Function& _Func, affinity_partitioner& _Part)
{
    typedef _Parallel_localized_chunk_helper<_Random_iterator, _Index_type, _Function, true> _Worker_class;
    // Use the same function that schedules work for parallel for
    _Parallel_chunk_impl<_Worker_class>(_First, _Range_arg, _Step, _Func, _Part);
}

template <typename _Random_iterator, typename _Function, typename _Partitioner>
void _Parallel_for_each_impl(const _Random_iterator& _First, const _Random_iterator& _Last, const _Function& _Func, _Partitioner&& _Part, std::random_access_iterator_tag)
{
    typedef typename std::iterator_traits<_Random_iterator>::difference_type _Index_type;

    // Exit early if there is nothing in the collection
    if (_First >= _Last)
    {
        return;
    }

    _Index_type _Range_size = _Last - _First;

    if (_Range_size == 1)
    {
        _Func(*_First);
    }
    else
    {
        _Index_type _Step = 1;

        _Parallel_for_each_partitioned_impl(_First, _Range_size, _Step, _Func, std::forward<_Partitioner>(_Part));
    }
}

/// <summary>
///     <c>parallel_for_each</c> applies a specified function to each element within a range, in parallel. It is semantically
///     equivalent to the <c>for_each</c> function in the <c>std</c> namespace, except that iteration over the elements is
///     performed in parallel, and the order of iteration is unspecified. The argument <paramref name="_Func"/> must support
///     a function call operator of the form <c>operator()(T)</c> where the parameter <paramref name="T"/> is the item type
///     of the container being iterated over.
/// </summary>
/// <typeparam name="_Iterator">
///     The type of the iterator being used to iterate over the container.
/// </typeparam>
/// <typeparam name="_Function">
///     The type of the function that will be applied to each element within the range.
/// </typeparam>
/// <param name="_First">
///     An iterator addressing the position of the first element to be included in parallel iteration.
/// </param>
/// <param name="_Last">
///     An iterator addressing the position one past the final element to be included in parallel iteration.
/// </param>
/// <param name="_Func">
///     A user-defined function object that is applied to each element in the range.
/// </param>
/// <remarks>
///     <see cref="auto_partitioner Class">auto_partitioner</see> will be used for the overload without an explicit partitioner.
///     <para>For iterators that do not support random access, only <see cref="auto_partitioner Class">auto_partitioner</see> is supported.</para>
///     <para>For more information, see <see cref="Parallel Algorithms"/>.</para>
/// </remarks>
/**/
template <typename _Iterator, typename _Function>
void parallel_for_each(_Iterator _First, _Iterator _Last, const _Function& _Func)
{
    parallel_for_each(_First, _Last, _Func, auto_partitioner());
}
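
// Example (illustrative sketch, not part of this header): doubling every element of a
// hypothetical vector 'v' in parallel. Assumes <vector> is included.
//
//     std::vector<int> v(4, 1);
//     Concurrency::parallel_for_each(v.begin(), v.end(), [](int& n) {
//         n *= 2;    // every element of v becomes 2; iteration order is unspecified
//     });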

/// <summary>
///     <c>parallel_for_each</c> applies a specified function to each element within a range, in parallel. It is semantically
///     equivalent to the <c>for_each</c> function in the <c>std</c> namespace, except that iteration over the elements is
///     performed in parallel, and the order of iteration is unspecified. The argument <paramref name="_Func"/> must support
///     a function call operator of the form <c>operator()(T)</c> where the parameter <paramref name="T"/> is the item type
///     of the container being iterated over.
/// </summary>
/// <typeparam name="_Iterator">
///     The type of the iterator being used to iterate over the container.
/// </typeparam>
/// <typeparam name="_Function">
///     The type of the function that will be applied to each element within the range.
/// </typeparam>
/// <param name="_First">
///     An iterator addressing the position of the first element to be included in parallel iteration.
/// </param>
/// <param name="_Last">
///     An iterator addressing the position one past the final element to be included in parallel iteration.
/// </param>
/// <param name="_Func">
///     A user-defined function object that is applied to each element in the range.
/// </param>
/// <param name="_Part">
///     A reference to the partitioner object. The argument can be one of
///     <c>const</c> <see cref="auto_partitioner Class">auto_partitioner</see><c>&amp;</c>,
///     <c>const</c> <see cref="static_partitioner Class">static_partitioner</see><c>&amp;</c>,
///     <c>const</c> <see cref="simple_partitioner Class">simple_partitioner</see><c>&amp;</c> or
///     <see cref="affinity_partitioner Class">affinity_partitioner</see><c>&amp;</c>.
///     If an <see cref="affinity_partitioner Class">affinity_partitioner</see> object is used, the reference must be a non-const l-value reference,
///     so that the algorithm can store state for future loops to re-use.
/// </param>
/// <remarks>
///     <see cref="auto_partitioner Class">auto_partitioner</see> will be used for the overload without an explicit partitioner.
///     <para>For iterators that do not support random access, only <see cref="auto_partitioner Class">auto_partitioner</see> is supported.</para>
///     <para>For more information, see <see cref="Parallel Algorithms"/>.</para>
/// </remarks>
/**/
template <typename _Iterator, typename _Function, typename _Partitioner>
void parallel_for_each(_Iterator _First, _Iterator _Last, const _Function& _Func, _Partitioner&& _Part)
{
    _Trace_ppl_function(PPLParallelForeachEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_START);
    _Parallel_for_each_impl(_First, _Last, _Func, std::forward<_Partitioner>(_Part), std::_Iter_cat(_First));
    _Trace_ppl_function(PPLParallelForeachEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_END);
}

// Disable C4180: qualifier applied to function type has no meaning; ignored
// Warning fires for passing Foo function pointer to parallel_for instead of &Foo.
#pragma warning(push)
#pragma warning(disable: 4180)

/// <summary>
///     Computes the sum of all elements in a specified range by computing successive partial sums, or computes the result of successive partial
///     results similarly obtained from using a specified binary operation other than sum, in parallel. <c>parallel_reduce</c> is semantically similar to
///     <c>std::accumulate</c>, except that it requires the binary operation to be associative, and requires an identity value instead of an initial value.
/// </summary>
/// <typeparam name="_Forward_iterator">
///     The iterator type of input range.
/// </typeparam>
/// <param name="_Begin">
///     An input iterator addressing the first element in the range to be reduced.
/// </param>
/// <param name="_End">
///     An input iterator addressing the element that is one position beyond the final element in the range to be reduced.
/// </param>
/// <param name="_Identity">
///     The identity value <paramref name="_Identity"/> is of the same type as the result type of the reduction and also the <c>value_type</c> of the iterator
///     for the first and second overloads. For the third overload, the identity value must have the same type as the result type of the reduction, but can be
///     different from the <c>value_type</c> of the iterator. It must have an appropriate value such that the range reduction operator <paramref name="_Range_fun"/>,
///     when applied to a range of a single element of type <c>value_type</c> and the identity value, behaves like a type cast of the value from type
///     <c>value_type</c> to the identity type.
/// </param>
/// <returns>
///     The result of the reduction.
/// </returns>
/// <remarks>
///     To perform a parallel reduction, the function divides the range into chunks based on the number of workers available to the underlying
///     scheduler. The reduction takes place in two phases, the first phase performs a reduction within each chunk, and the second phase performs
///     a reduction between the partial results from each chunk.
///     <para>The first overload requires that the iterator's <c>value_type</c>, <c>T</c>, be the same as the identity value type as well as the reduction
///     result type. The element type T must provide the operator <c>T T::operator + (T)</c> to reduce elements in each chunk. The same operator is
///     used in the second phase as well.</para>
///     <para>The second overload also requires that the iterator's <c>value_type</c> be the same as the identity value type as well as the reduction
///     result type. The supplied binary operator <paramref name="_Sym_fun"/> is used in both reduction phases, with the identity value as the initial
///     value for the first phase.</para>
///     <para>For the third overload, the identity value type must be the same as the reduction result type, but the iterator's <c>value_type</c> may be
///     different from both. The range reduction function <paramref name="_Range_fun"/> is used in the first phase with the identity
///     value as the initial value, and the binary function <paramref name="_Sym_reduce_fun"/> is applied to sub results in the second phase.</para>
/// </remarks>
/**/
template<typename _Forward_iterator>
inline typename std::iterator_traits<_Forward_iterator>::value_type parallel_reduce(
    _Forward_iterator _Begin, _Forward_iterator _End, const typename std::iterator_traits<_Forward_iterator>::value_type &_Identity)
{
    return parallel_reduce(_Begin, _End, _Identity, std::plus<typename std::iterator_traits<_Forward_iterator>::value_type>());
}
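
// Example (illustrative sketch, not part of this header): summing a hypothetical vector with
// the overload above; 0 is the identity value for operator+. Assumes <vector> is included.
//
//     std::vector<int> v(100, 1);
//     int total = Concurrency::parallel_reduce(v.begin(), v.end(), 0);
//     // total == 100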

/// <summary>
///     Computes the sum of all elements in a specified range by computing successive partial sums, or computes the result of successive partial
///     results similarly obtained from using a specified binary operation other than sum, in parallel. <c>parallel_reduce</c> is semantically similar to
///     <c>std::accumulate</c>, except that it requires the binary operation to be associative, and requires an identity value instead of an initial value.
/// </summary>
/// <typeparam name="_Forward_iterator">
///     The iterator type of input range.
/// </typeparam>
/// <typeparam name="_Sym_reduce_fun">
///     The type of the symmetric reduction function. This must be a function type with signature <c>_Reduce_type _Sym_fun(_Reduce_type, _Reduce_type)</c>, where
///     _Reduce_type is the same as the identity type and the result type of the reduction. For the third overload, this should be consistent
///     with the output type of <c>_Range_reduce_fun</c>.
/// </typeparam>
/// <param name="_Begin">
///     An input iterator addressing the first element in the range to be reduced.
/// </param>
/// <param name="_End">
///     An input iterator addressing the element that is one position beyond the final element in the range to be reduced.
/// </param>
/// <param name="_Identity">
///     The identity value <paramref name="_Identity"/> is of the same type as the result type of the reduction and also the <c>value_type</c> of the iterator
///     for the first and second overloads. For the third overload, the identity value must have the same type as the result type of the reduction, but can be
///     different from the <c>value_type</c> of the iterator. It must have an appropriate value such that the range reduction operator <paramref name="_Range_fun"/>,
///     when applied to a range of a single element of type <c>value_type</c> and the identity value, behaves like a type cast of the value from type
///     <c>value_type</c> to the identity type.
/// </param>
/// <param name="_Sym_fun">
///     The symmetric function that will be used in the second phase of the reduction. Refer to Remarks for more information.
/// </param>
/// <returns>
///     The result of the reduction.
/// </returns>
/// <remarks>
///     To perform a parallel reduction, the function divides the range into chunks based on the number of workers available to the underlying
///     scheduler. The reduction takes place in two phases, the first phase performs a reduction within each chunk, and the second phase performs
///     a reduction between the partial results from each chunk.
///     <para>The first overload requires that the iterator's <c>value_type</c>, <c>T</c>, be the same as the identity value type as well as the reduction
///     result type. The element type T must provide the operator <c>T T::operator + (T)</c> to reduce elements in each chunk. The same operator is
///     used in the second phase as well.</para>
///     <para>The second overload also requires that the iterator's <c>value_type</c> be the same as the identity value type as well as the reduction
///     result type. The supplied binary operator <paramref name="_Sym_fun"/> is used in both reduction phases, with the identity value as the initial
///     value for the first phase.</para>
///     <para>For the third overload, the identity value type must be the same as the reduction result type, but the iterator's <c>value_type</c> may be
///     different from both. The range reduction function <paramref name="_Range_fun"/> is used in the first phase with the identity
///     value as the initial value, and the binary function <paramref name="_Sym_reduce_fun"/> is applied to sub results in the second phase.</para>
/// </remarks>
/**/
template<typename _Forward_iterator, typename _Sym_reduce_fun>
inline typename std::iterator_traits<_Forward_iterator>::value_type parallel_reduce(_Forward_iterator _Begin, _Forward_iterator _End,
    const typename std::iterator_traits<_Forward_iterator>::value_type &_Identity, _Sym_reduce_fun _Sym_fun)
{
    typedef typename std::remove_cv<typename std::iterator_traits<_Forward_iterator>::value_type>::type _Reduce_type;

    return parallel_reduce(_Begin, _End, _Identity,
        [_Sym_fun](_Forward_iterator _Begin, _Forward_iterator _End, _Reduce_type _Init)->_Reduce_type
        {
            while (_Begin != _End)
            {
                _Init = _Sym_fun(_Init, *_Begin++);
            }

            return _Init;
        },
        _Sym_fun);
}
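
// Example (illustrative sketch, not part of this header): computing a maximum with the overload
// above. INT_MIN serves as the identity for max; assumes <vector> and <climits> are included and
// the vector 'v' is hypothetical.
//
//     std::vector<int> v(100);
//     for (int i = 0; i < 100; ++i) { v[i] = i % 37; }
//     int largest = Concurrency::parallel_reduce(v.begin(), v.end(), INT_MIN,
//         [](int a, int b) { return b > a ? b : a; });    // associative, with INT_MIN as identity
//     // largest == 36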

template <typename _Reduce_type, typename _Sub_function, typename _Combinable_type>
struct _Reduce_functor_helper;

template<typename _Ty, typename _Sym_fun>
class _Order_combinable;

/// <summary>
///     Computes the sum of all elements in a specified range by computing successive partial sums, or computes the result of successive partial
///     results similarly obtained from using a specified binary operation other than sum, in parallel. <c>parallel_reduce</c> is semantically similar to
///     <c>std::accumulate</c>, except that it requires the binary operation to be associative, and requires an identity value instead of an initial value.
/// </summary>
/// <typeparam name="_Reduce_type">
///     The type that the input will reduce to, which can be different from the input element type.
///     The return value and identity value will have this type.
/// </typeparam>
/// <typeparam name="_Forward_iterator">
///     The iterator type of input range.
/// </typeparam>
/// <typeparam name="_Range_reduce_fun">
///     The type of the range reduction function. This must be a function type with signature <c>_Reduce_type _Range_fun(_Forward_iterator, _Forward_iterator, _Reduce_type)</c>, where
///     _Reduce_type is the same as the identity type and the result type of the reduction.
/// </typeparam>
/// <typeparam name="_Sym_reduce_fun">
///     The type of the symmetric reduction function. This must be a function type with signature <c>_Reduce_type _Sym_fun(_Reduce_type, _Reduce_type)</c>, where
///     _Reduce_type is the same as the identity type and the result type of the reduction. For the third overload, this should be consistent
///     with the output type of <c>_Range_reduce_fun</c>.
/// </typeparam>
/// <param name="_Begin">
///     An input iterator addressing the first element in the range to be reduced.
/// </param>
/// <param name="_End">
///     An input iterator addressing the element that is one position beyond the final element in the range to be reduced.
/// </param>
/// <param name="_Identity">
///     The identity value <paramref name="_Identity"/> is of the same type as the result type of the reduction and also the <c>value_type</c> of the iterator
///     for the first and second overloads. For the third overload, the identity value must have the same type as the result type of the reduction, but can be
///     different from the <c>value_type</c> of the iterator. It must have an appropriate value such that the range reduction operator <paramref name="_Range_fun"/>,
///     when applied to a range of a single element of type <c>value_type</c> and the identity value, behaves like a type cast of the value from type
///     <c>value_type</c> to the identity type.
/// </param>
/// <param name="_Range_fun">
///     The function that will be used in the first phase of the reduction. Refer to Remarks for more information.
/// </param>
/// <param name="_Sym_fun">
///     The symmetric function that will be used in the second phase of the reduction. Refer to Remarks for more information.
/// </param>
/// <returns>
///     The result of the reduction.
/// </returns>
/// <remarks>
///     To perform a parallel reduction, the function divides the range into chunks based on the number of workers available to the underlying
///     scheduler. The reduction takes place in two phases, the first phase performs a reduction within each chunk, and the second phase performs
///     a reduction between the partial results from each chunk.
///     <para>The first overload requires that the iterator's <c>value_type</c>, <c>T</c>, be the same as the identity value type as well as the reduction
///     result type. The element type T must provide the operator <c>T T::operator + (T)</c> to reduce elements in each chunk. The same operator is
///     used in the second phase as well.</para>
///     <para>The second overload also requires that the iterator's <c>value_type</c> be the same as the identity value type as well as the reduction
///     result type. The supplied binary operator <paramref name="_Sym_fun"/> is used in both reduction phases, with the identity value as the initial
///     value for the first phase.</para>
///     <para>For the third overload, the identity value type must be the same as the reduction result type, but the iterator's <c>value_type</c> may be
///     different from both. The range reduction function <paramref name="_Range_fun"/> is used in the first phase with the identity
///     value as the initial value, and the binary function <paramref name="_Sym_reduce_fun"/> is applied to sub results in the second phase.</para>
/// </remarks>
/**/
template<typename _Reduce_type, typename _Forward_iterator, typename _Range_reduce_fun, typename _Sym_reduce_fun>
inline _Reduce_type parallel_reduce(_Forward_iterator _Begin, _Forward_iterator _End, const _Reduce_type& _Identity,
    const _Range_reduce_fun &_Range_fun, const _Sym_reduce_fun &_Sym_fun)
{
    typedef typename std::iterator_traits<_Forward_iterator>::value_type _Value_type;

    static_assert(!std::tr1::is_same<typename std::iterator_traits<_Forward_iterator>::iterator_category, std::input_iterator_tag>::value
        && !std::tr1::is_same<typename std::iterator_traits<_Forward_iterator>::iterator_category, std::output_iterator_tag>::value,
        "iterator can not be input_iterator or output_iterator.");

    return _Parallel_reduce_impl(_Begin, _End,
        _Reduce_functor_helper<_Reduce_type, _Range_reduce_fun,
            _Order_combinable<_Reduce_type, _Sym_reduce_fun>>(_Identity, _Range_fun, _Order_combinable<_Reduce_type, _Sym_reduce_fun>(_Sym_fun)),
        typename std::iterator_traits<_Forward_iterator>::iterator_category());
}
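
// Example (illustrative sketch, not part of this header): the overload above reducing to a type
// that differs from the iterator's value_type -- the total character count of a hypothetical
// vector of strings, with size_t(0) as the identity. Assumes <vector> and <string> are included.
//
//     std::vector<std::string> words(3, "abc");
//     size_t total = Concurrency::parallel_reduce(words.begin(), words.end(), size_t(0),
//         [](std::vector<std::string>::iterator b, std::vector<std::string>::iterator e, size_t init) -> size_t
//         {
//             for (; b != e; ++b)
//             {
//                 init += b->size();    // first phase: reduce a sub-range to a length
//             }
//             return init;
//         },
//         std::plus<size_t>());    // second phase: combine the partial lengths
//     // total == 9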

// Ordered serial combinable object
template<typename _Ty, typename _Sym_fun>
class _Order_combinable
{
public:
    // Each bucket is written only once, so contention is limited
    struct _Bucket
    {
        // Allocate enough space in the Bucket to hold a value
        char _Value[(sizeof(_Ty) / sizeof(char))];
        _Bucket * _Next;

        _Bucket(_Bucket *_N)
        {
            _Next = _N;
        }

        void _Insert(_Bucket *_Item)
        {
            // No need to lock, only one thread will insert
            _Item->_Next = _Next;
            _Next = _Item;
        }

        // Construct value in bucket
        void _Put(const _Ty &_Cur)
        {
            new(reinterpret_cast<_Ty *>(&_Value)) _Ty(_Cur);
        }
    };

private:
    const _Sym_fun &_M_fun;
    size_t _M_number;
    _Bucket *_M_root;
    _Order_combinable &operator =(const _Order_combinable &other);

public:
    _Bucket *_Construct(_Bucket *_Par = 0)
    {
        _Bucket * _Ret = static_cast<_Bucket *>(Concurrency::Alloc(sizeof(_Bucket)));
        return new(_Ret)_Bucket(_Par);
    }

    _Order_combinable(const _Sym_fun &_Fun): _M_fun(_Fun)
    {
        _M_root = 0;
        _M_number = 0;
    }

    ~_Order_combinable()
    {
        while (_M_root)
        {
            _Bucket *_Cur = _M_root;
            _M_root = _M_root->_Next;
            reinterpret_cast<_Ty &>(_Cur->_Value).~_Ty();
            Concurrency::Free(_Cur);
        }
    }

    // Serially combine and release the list, return result
    _Ty _Serial_combine_release()
    {
        _Ty _Ret(reinterpret_cast<_Ty &>(_M_root->_Value));
        _Bucket *_Cur = _M_root;
        _M_root = _M_root->_Next;

        while (_M_root)
        {
            reinterpret_cast<_Ty &>(_Cur->_Value).~_Ty();
            Concurrency::Free(_Cur);
            _Cur = _M_root;
            _Ret = _M_fun(reinterpret_cast <_Ty &> (_Cur->_Value), _Ret);
            _M_root = _M_root->_Next;
        }

        reinterpret_cast<_Ty &>(_Cur->_Value).~_Ty();
        Concurrency::Free(_Cur);

        return _Ret;
    }

    // Allocate a bucket and push it onto the front of the list
    _Bucket *_Unsafe_push_back()
    {
        return _M_root = _Construct(_M_root);
    }
};

// Implementation for the parallel reduce
template <typename _Forward_iterator, typename _Function>
typename _Function::_Reduce_type _Parallel_reduce_impl(_Forward_iterator _First, const _Forward_iterator& _Last, const _Function& _Func,
    std::forward_iterator_tag)
{
    // Because this is a forward iterator, it is difficult to validate that _First comes before _Last, so
    // it is up to the user to provide a valid range.
    if (_First != _Last)
    {
        task_group _Task_group;
        _Parallel_reduce_forward_executor(_First, _Last, _Func, _Task_group);
        _Task_group.wait();
        return _Func._Combinable._Serial_combine_release();
    }
    else
    {
        return _Func._Identity_value;
    }
}

template<typename _Forward_iterator, typename _Functor>
class _Parallel_reduce_fixed_worker;

template <typename _Worker, typename _Random_iterator, typename _Function>
void _Parallel_reduce_random_executor(_Random_iterator _Begin, _Random_iterator _End, const _Function& _Fun);

template <typename _Random_iterator, typename _Function>
typename _Function::_Reduce_type _Parallel_reduce_impl(_Random_iterator _First, _Random_iterator _Last, const _Function& _Func,
    std::random_access_iterator_tag)
{
    typedef _Parallel_reduce_fixed_worker<_Random_iterator, _Function> _Worker_class;

    // Special case for 0, 1 element
    if (_First >= _Last)
    {
        return _Func._Identity_value;
    }
    // Directly compute if size is too small
    else if (_Last - _First <= 1)
    {
        _Worker_class(_First, _Last, _Func)();
        return _Func._Combinable._Serial_combine_release();
    }
    else
    {
        // Use a fixed, ordered chunk partition to schedule the work
        _Parallel_reduce_random_executor<_Worker_class>(_First, _Last, _Func);
        return _Func._Combinable._Serial_combine_release();
    }
}

// Helper struct that assembles all of the functors
template <typename _Reduce_ty, typename _Sub_function, typename _Combinable_type>
struct _Reduce_functor_helper
{
    const _Sub_function &_Sub_fun;
    const _Reduce_ty &_Identity_value;

    mutable _Combinable_type &_Combinable;

    // The template parameter is named _Reduce_ty so that this typedef does not illegally redeclare it
    typedef _Reduce_ty _Reduce_type;
    typedef typename _Combinable_type::_Bucket Bucket_type;

    _Reduce_functor_helper(const _Reduce_type &_Identity, const _Sub_function &_Sub_fun, _Combinable_type &&comb):
        _Sub_fun(_Sub_fun), _Combinable(comb), _Identity_value(_Identity)
    {
    }

private:
    _Reduce_functor_helper &operator =(const _Reduce_functor_helper &other);
};

// All of the code below implements the worker used when there is no range stealing
template<typename _Forward_iterator, typename _Functor>
class _Parallel_reduce_fixed_worker
{
public:
    // The bucket allocation order will depend on the worker construction order
    _Parallel_reduce_fixed_worker(_Forward_iterator _Begin, _Forward_iterator _End, const _Functor &_Fun):
        _M_begin(_Begin), _M_end(_End), _M_fun(_Fun), _M_bucket(_M_fun._Combinable._Unsafe_push_back())
    {
    }

    void operator ()() const
    {
        _M_bucket->_Put(_M_fun._Sub_fun(_M_begin, _M_end, _M_fun._Identity_value));
    }

private:
    const _Functor &_M_fun;
    const _Forward_iterator _M_begin, _M_end;
    typename _Functor::Bucket_type * const _M_bucket;
    _Parallel_reduce_fixed_worker &operator =(const _Parallel_reduce_fixed_worker &other);
};

// The parallel worker executor for random access iterators
// It divides the work into a fixed number of chunks,
// almost the same as the fixed parallel_for, except that it preserves the chunk division order
template <typename _Worker, typename _Random_iterator, typename _Function>
void _Parallel_reduce_random_executor(_Random_iterator _Begin, _Random_iterator _End, const _Function& _Fun)
{
    size_t _Cpu_num = static_cast<size_t>(Concurrency::details::_CurrentScheduler::_GetNumberOfVirtualProcessors()), _Size = _End - _Begin;

    structured_task_group _Tg;
    Concurrency::details::_MallocaArrayHolder<task_handle<_Worker>> _Holder;
    task_handle<_Worker> *_Tasks = _Holder._InitOnRawMalloca(_malloca(sizeof(task_handle<_Worker>) * (_Cpu_num - 1)));

    size_t _Begin_index = 0;
    size_t _Step = _Size / _Cpu_num;
    size_t _NumRemaining = _Size - _Step * _Cpu_num;

    for(size_t _I = 0; _I < _Cpu_num - 1; _I++)
    {
        size_t _Next = _Begin_index + _Step;

        // Distribute the remaining iterations, one per chunk
        if (_NumRemaining)
        {
            --_NumRemaining;
            ++_Next;
        }

        // New up a task_handle "in-place", in the array preallocated on the stack
        new (_Tasks + _I) task_handle<_Worker>(_Worker(_Begin + _Begin_index, _Begin + _Next, _Fun));
        _Holder._IncrementConstructedElemsCount();

        // Run each of the chunk _Tasks in parallel
        _Tg.run(_Tasks[_I]);
        _Begin_index = _Next;
    }

    task_handle<_Worker> _Tail(_Worker(_Begin + _Begin_index, _End, _Fun));
    _Tg.run_and_wait(_Tail);
}

// The parallel worker executor for forward iterators
// Divide chunks on the fly
template <typename _Forward_iterator, typename _Function, int _Default_worker_size, int _Default_chunk_size>
struct _Parallel_reduce_forward_executor_helper
{
    typedef _Parallel_reduce_fixed_worker<_Forward_iterator, _Function> _Worker_class;
    mutable std::auto_ptr<task_handle<_Worker_class>> _Workers;
    int _Worker_size;

    _Parallel_reduce_forward_executor_helper(_Forward_iterator &_First, _Forward_iterator _Last, const _Function& _Func):
        _Workers(static_cast<task_handle<_Worker_class> *>(Concurrency::Alloc(sizeof(task_handle<_Worker_class>) * _Default_worker_size)))
    {
        _Worker_size = 0;
        while (_Worker_size < _Default_worker_size && _First != _Last)
        {
            // Copy the range _Head
            _Forward_iterator _Head = _First;

            // Read from forward iterator
            for (size_t _I = 0; _I < _Default_chunk_size && _First != _Last; ++_I, ++_First)
            {
                // Body is empty
            }

            // _First will be the end of current chunk
            new (_Workers.get() + _Worker_size++) task_handle<_Worker_class>(_Worker_class(_Head, _First, _Func));
        }
    }

    _Parallel_reduce_forward_executor_helper(const _Parallel_reduce_forward_executor_helper &_Other):
        _Workers(_Other._Workers), _Worker_size(_Other._Worker_size)
    {
    }

    void operator ()() const
    {
        structured_task_group _Tg;
        for(int _I = 0; _I < _Worker_size; _I++)
        {
            _Tg.run(_Workers.get()[_I]);
        }
        _Tg.wait();
    }

    ~_Parallel_reduce_forward_executor_helper()
    {
        if (_Workers.get())
        {
            for (int _I = 0; _I < _Worker_size; _I++)
            {
                _Workers.get()[_I].~task_handle<_Worker_class>();
            }
            Concurrency::Free(_Workers.release());
        }
    }
};

template <typename _Forward_iterator, typename _Function>
void _Parallel_reduce_forward_executor(_Forward_iterator _First, _Forward_iterator _Last, const _Function& _Func, task_group& _Task_group)
{
    const static int _Internal_worker_number = 1024, _Default_chunk_size = 512;
    typedef _Parallel_reduce_fixed_worker<_Forward_iterator, _Function> _Worker_class;

    structured_task_group _Worker_group;
    Concurrency::details::_MallocaArrayHolder<task_handle<_Worker_class>> _Holder;
    task_handle<_Worker_class>* _Workers = _Holder._InitOnRawMalloca(_malloca(_Internal_worker_number * sizeof(task_handle<_Worker_class>)));

    // Start execution first
    int _Index = 0;
    while (_Index < _Internal_worker_number && _First != _Last)
    {
        // Copy the range _Head
        _Forward_iterator _Head = _First;

        // Read from forward iterator
        for (size_t _I = 0; _I < _Default_chunk_size && _First != _Last; ++_I, ++_First)
        {
            // Body is empty
        }

        // Create a new task, _First is range _End
        new (_Workers + _Index) task_handle<_Worker_class>(_Worker_class(_Head, _First, _Func));
        _Holder._IncrementConstructedElemsCount();
        _Worker_group.run(_Workers[_Index]);
        ++_Index;
    }

    // Divide and enqueue the remaining elements
    while (_First != _Last)
    {
        _Task_group.run(_Parallel_reduce_forward_executor_helper<_Forward_iterator, _Function, _Internal_worker_number, _Default_chunk_size>(_First, _Last, _Func));
    }

    _Worker_group.wait();
}

#pragma warning(pop)


// Disable C4180: qualifier applied to function type has no meaning; ignored
// Warning fires for passing Foo function pointer to parallel_for instead of &Foo.
#pragma warning(push)
#pragma warning(disable: 4180)

//
// Dispatch the execution and handle the condition that all of the iterators are random access
//
template<typename _Any_input_traits, typename _Any_output_traits>
struct _Unary_transform_impl_helper
{
    template<typename _Input_iterator, typename _Output_iterator, typename _Unary_operator>
    static void _Parallel_transform_unary_impl(_Input_iterator _Begin, _Input_iterator _End, _Output_iterator& _Result, const _Unary_operator& _Unary_op, const auto_partitioner&)
    {
        task_group _Tg;
        _Parallel_transform_unary_impl2(_Begin, _End, _Result, _Unary_op, _Tg);
        _Tg.wait();
    }
};

template<>
struct _Unary_transform_impl_helper<std::random_access_iterator_tag, std::random_access_iterator_tag>
{
    template<typename _Random_input_iterator, typename _Random_output_iterator, typename _Unary_operator, typename _Partitioner>
    static void _Parallel_transform_unary_impl(_Random_input_iterator _Begin, _Random_input_iterator _End,
        _Random_output_iterator& _Result, const _Unary_operator& _Unary_op, _Partitioner&& _Part)
    {
        if (_Begin < _End)
        {
            Concurrency::_Parallel_for_impl(static_cast<size_t>(0), static_cast<size_t>(_End - _Begin), static_cast<size_t>(1),
                [_Begin, &_Result, &_Unary_op](size_t _Index)
                {
                    _Result[_Index] = _Unary_op(_Begin[_Index]);
                },
                std::forward<_Partitioner>(_Part));
            _Result += _End - _Begin;
        }
    }
};

template<typename _Any_input_traits1, typename _Any_input_traits2, typename _Any_output_traits>
struct _Binary_transform_impl_helper
{

    template<typename _Input_iterator1, typename _Input_iterator2, typename _Output_iterator, typename _Binary_operator>
    static void _Parallel_transform_binary_impl(_Input_iterator1 _Begin1, _Input_iterator1 _End1, _Input_iterator2 _Begin2,
        _Output_iterator& _Result, const _Binary_operator& _Binary_op, const auto_partitioner&)
    {
        task_group _Tg;
        _Parallel_transform_binary_impl2(_Begin1, _End1, _Begin2, _Result, _Binary_op, _Tg);
        _Tg.wait();
    }
};

template<>
struct _Binary_transform_impl_helper<std::random_access_iterator_tag, std::random_access_iterator_tag, std::random_access_iterator_tag>
{
    template<typename _Random_input_iterator1, typename _Random_input_iterator2, typename _Random_output_iterator, typename _Binary_operator, typename _Partitioner>
    static void _Parallel_transform_binary_impl(_Random_input_iterator1 _Begin1, _Random_input_iterator1 _End1,
        _Random_input_iterator2 _Begin2, _Random_output_iterator& _Result, const _Binary_operator& _Binary_op, _Partitioner&& _Part)
    {
        if (_Begin1 < _End1)
        {
            Concurrency::_Parallel_for_impl(static_cast<size_t>(0), static_cast<size_t>(_End1 - _Begin1), static_cast<size_t>(1),
                [_Begin1, _Begin2, &_Result, &_Binary_op](size_t _Index)
                {
                    _Result[_Index] = _Binary_op(_Begin1[_Index], _Begin2[_Index]);
                },
                std::forward<_Partitioner>(_Part));
            _Result += _End1 - _Begin1;
        }
    }
};

//
// The implementation for the case where at least one of the iterators is a forward iterator
//
template <typename _Forward_iterator, typename _Iterator_kind>
class _Iterator_helper
{
public:
    static const size_t _Size = 1024;
    typedef typename std::iterator_traits<_Forward_iterator>::value_type value_type;

    _Iterator_helper()
    {
        static_assert(!std::is_same<_Iterator_kind, std::input_iterator_tag>::value
            && !std::is_same<_Iterator_kind, std::output_iterator_tag>::value,
            "the iterator cannot be an input_iterator or an output_iterator.");
    }

    size_t _Populate(_Forward_iterator& _First, _Forward_iterator _Last)
    {
        size_t _Length = 0;
        static_assert(std::is_lvalue_reference<decltype(*_First)>::value, "lvalue required for forward iterator operator *");

        for (size_t _Index=0; (_Index < _Size) && (_First != _Last); _Index++)
        {
            // Only l-values are supported here, so taking the address is safe
            _M_element_array[_Length++] = &(*_First++);
        }

        return _Length;
    }

    void _Populate(_Forward_iterator& _First, size_t _Length)
    {
        for (size_t _Index=0; _Index < _Length; _Index++)
        {
            _M_element_array[_Index] = &(*_First++);
        }
    }

    void _Store(const value_type& _Elem, size_t _Index) const
    {
        *(_M_element_array[_Index]) = _Elem;
    }

    typename std::iterator_traits<_Forward_iterator>::reference _Load(size_t _Index) const
    {
        return *(_M_element_array[_Index]);
    }

private:
    typename std::iterator_traits<_Forward_iterator>::pointer _M_element_array[_Size];
};

template <typename _Random_iterator>
class _Iterator_helper<_Random_iterator, std::random_access_iterator_tag>
{
public:
    static const size_t _Size = 1024;
    typedef typename std::iterator_traits<_Random_iterator>::value_type value_type;

    _Iterator_helper()
    {
    }

    size_t _Populate(_Random_iterator& _First, _Random_iterator _Last)
    {
        typename std::iterator_traits<_Random_iterator>::difference_type _Range_size = _Last - _First;
        typename std::iterator_traits<_Random_iterator>::difference_type _Sized = _Size;
        _M_first = _First;

        if (_Range_size > _Sized)
        {
            _First += _Size;
            return _Size;
        }
        else
        {
            _First += _Range_size;
            return static_cast<size_t>(_Range_size);
        }
    }

    void _Populate(_Random_iterator& _First, size_t _Length)
    {
        _M_first = _First;
        _First += _Length;
    }

    void _Store(const value_type& _Elem, size_t _Index) const
    {
        _M_first[_Index] = _Elem;
    }

    typename std::iterator_traits<_Random_iterator>::reference _Load(size_t _Index) const
    {
        // We only support l-value here
        return _M_first[_Index];
    }

private:
    _Random_iterator _M_first;
};

template <typename _Input_iterator1, typename _Input_iterator2, typename _Output_iterator, typename _Binary_operator>
class _Parallel_transform_binary_helper
{
public:
    _Parallel_transform_binary_helper(_Input_iterator1& _First1, _Input_iterator1 _Last1, _Input_iterator2& _First2,
        _Output_iterator& _Result, const _Binary_operator& _Binary_op) :
        _M_binary_op(_Binary_op), _M_len(0)
        {
            _M_len = _M_input_helper1._Populate(_First1, _Last1);
            _M_input_helper2._Populate(_First2, _M_len);
            _M_output_helper._Populate(_Result, _M_len);
        }

        void operator()() const
        {
            // Invoke parallel_for on the batched up array of elements
            Concurrency::_Parallel_for_impl(static_cast<size_t>(0), _M_len, static_cast<size_t>(1),
                [this] (size_t _Index)
                {
                    _M_output_helper._Store(_M_binary_op(_M_input_helper1._Load(_Index), _M_input_helper2._Load(_Index)), _Index);
                });
        }

private:

    _Iterator_helper<_Input_iterator1, typename std::iterator_traits<_Input_iterator1>::iterator_category>   _M_input_helper1;
    _Iterator_helper<_Input_iterator2, typename std::iterator_traits<_Input_iterator2>::iterator_category>   _M_input_helper2;
    _Iterator_helper<_Output_iterator, typename std::iterator_traits<_Output_iterator>::iterator_category>   _M_output_helper;
    const _Binary_operator&                                                                                  _M_binary_op;
    size_t                                                                                                   _M_len;

    _Parallel_transform_binary_helper const & operator=(_Parallel_transform_binary_helper const&);    // no assignment operator
};

template <typename _Input_iterator1, typename _Input_iterator2, typename _Output_iterator, typename _Binary_operator>
void _Parallel_transform_binary_impl2(_Input_iterator1 _First1, _Input_iterator1 _Last1, _Input_iterator2 _First2, _Output_iterator &_Result,
    const _Binary_operator& _Binary_op, task_group& _Tg)
{
    // This functor will be copied onto the heap and will execute the chunk in parallel
    {
        _Parallel_transform_binary_helper<_Input_iterator1, _Input_iterator2, _Output_iterator, _Binary_operator> functor(_First1, _Last1, _First2, _Result, _Binary_op);
        _Tg.run(functor);
    }

    // If there is a tail, push the tail
    if (_First1 != _Last1)
    {
        _Tg.run(
            [=, &_Result, &_Binary_op, &_Tg]
            {
                _Parallel_transform_binary_impl2(_First1, _Last1, _First2, _Result, _Binary_op, _Tg);
            });
    }
}

template <typename _Input_iterator, typename _Output_iterator, typename _Unary_operator>
class _Parallel_transform_unary_helper
{
public:
    _Parallel_transform_unary_helper(_Input_iterator& _First, _Input_iterator _Last, _Output_iterator &_Result, const _Unary_operator& _Unary_op) :
        _M_unary_op(_Unary_op), _M_len(0)
        {
            _M_len = _M_input_helper._Populate(_First, _Last);
            _M_output_helper._Populate(_Result, _M_len);
        }

        void operator()() const
        {
            // Invoke parallel_for on the batched up array of elements
            Concurrency::_Parallel_for_impl(static_cast<size_t>(0), _M_len, static_cast<size_t>(1),
                [this] (size_t _Index)
                {
                    _M_output_helper._Store(_M_unary_op(_M_input_helper._Load(_Index)), _Index);
                });
        }

private:

    _Iterator_helper<_Input_iterator, typename std::iterator_traits<_Input_iterator>::iterator_category>   _M_input_helper;
    _Iterator_helper<_Output_iterator, typename std::iterator_traits<_Output_iterator>::iterator_category> _M_output_helper;
    const _Unary_operator&                                                                                  _M_unary_op;
    size_t                                                                                                  _M_len;

    _Parallel_transform_unary_helper const & operator=(_Parallel_transform_unary_helper const&);    // no assignment operator
};

template <typename _Input_iterator, typename _Output_iterator, typename _Unary_operator>
void _Parallel_transform_unary_impl2(_Input_iterator _First, _Input_iterator _Last, _Output_iterator &_Result,
    const _Unary_operator& _Unary_op, task_group& _Tg)
{
    // This functor will be copied onto the heap and will execute the chunk in parallel
    {
        _Parallel_transform_unary_helper<_Input_iterator, _Output_iterator, _Unary_operator> functor(_First, _Last, _Result, _Unary_op);
        _Tg.run(functor);
    }

    // If there is a tail, push the tail
    if (_First != _Last)
    {
        _Tg.run(
            [=, &_Result, &_Unary_op, &_Tg]
            {
                _Parallel_transform_unary_impl2(_First, _Last, _Result, _Unary_op, _Tg);
            });
    }
}

template <typename _Input_iterator, typename _Output_iterator, typename _Unary_operator, typename _Partitioner>
_Output_iterator _Parallel_transform_unary_impl(_Input_iterator _First, _Input_iterator _Last, _Output_iterator _Result, const _Unary_operator& _Unary_op, _Partitioner&& _Part)
{
    typedef typename std::iterator_traits<_Input_iterator>::iterator_category _Input_iterator_type;
    typedef typename std::iterator_traits<_Output_iterator>::iterator_category _Output_iterator_type;

    if (_First != _Last)
    {
        _Unary_transform_impl_helper<_Input_iterator_type, _Output_iterator_type>
            ::_Parallel_transform_unary_impl(_First, _Last, _Result, _Unary_op, std::forward<_Partitioner>(_Part));
    }

    return _Result;
}

/// <summary>
///     Applies a specified function object to each element in a source range, or to a pair of elements from two source ranges,
///     and copies the return values of the function object into a destination range, in parallel. This function is semantically
///     equivalent to <c>std::transform</c>.
/// </summary>
/// <typeparam name="_Input_iterator1">
///     The type of the first or only input iterator.
/// </typeparam>
/// <typeparam name="_Output_iterator">
///     The type of the output iterator.
/// </typeparam>
/// <typeparam name="_Unary_operator">
///     The type of the unary functor to be executed on each element in the input range.
/// </typeparam>
/// <param name="_First1">
///     An input iterator addressing the position of the first element in the first or only source range to be operated on.
/// </param>
/// <param name="_Last1">
///     An input iterator addressing the position one past the final element in the first or only source range to be operated on.
/// </param>
/// <param name="_Result">
///     An output iterator addressing the position of the first element in the destination range.
/// </param>
/// <param name="_Unary_op">
///     A user-defined unary function object that is applied to each element in the source range.
/// </param>
/// <param name="_Part">
///     A reference to the partitioner object. The argument can be one of
///     <c>const</c> <see ref="auto_partitioner Class">auto_partitioner</see><c>&amp;</c>,
///     <c>const</c> <see ref="static_partitioner Class">static_partitioner</see><c>&amp;</c>,
///     <c>const</c> <see ref="simple_partitioner Class">simple_partitioner</see><c>&amp;</c> or
///     <see ref="affinity_partitioner Class">affinity_partitioner</see><c>&amp;</c>
///     If an <see ref="affinity_partitioner Class">affinity_partitioner</see> object is used, the reference must be a non-const l-value reference,
///     so that the algorithm can store state for future loops to re-use.
/// </param>
/// <returns>
///     An output iterator addressing the position one past the final element in the destination range that is receiving the output elements
///     transformed by the function object.
/// </returns>
/// <remarks>
///     <see ref="auto_partitioner Class">auto_partitioner</see> will be used for the overloads without an explicit partitioner argument.
///     <para>For iterators that do not support random access, only <see ref="auto_partitioner Class">auto_partitioner</see> is supported.</para>
///     <para>The overloads that take the argument <paramref name="_Unary_op"/> transform the input range into the output range by applying
///     the unary functor to each element in the input range. <paramref name="_Unary_op"/> must support the function call operator with signature
///     <c>operator()(T)</c> where <c>T</c> is the value type of the range being iterated over.</para>
///     <para>The overloads that take the argument <paramref name="_Binary_op"/> transform two input ranges into the output range by applying the
///     binary functor to one element from the first input range and one element from the second input range. <paramref name="_Binary_op"/> must support
///     the function call operator with signature <c>operator()(T, U)</c> where <c>T</c>, <c>U</c> are value types of the two input iterators.</para>
///     <para>For more information, see <see cref="Parallel Algorithms"/>.</para>
/// </remarks>
/**/
template <typename _Input_iterator1, typename _Output_iterator, typename _Unary_operator>
_Output_iterator parallel_transform(_Input_iterator1 _First1, _Input_iterator1 _Last1, _Output_iterator _Result, const _Unary_operator& _Unary_op, const auto_partitioner& _Part = auto_partitioner())
{
    return _Parallel_transform_unary_impl(_First1, _Last1, _Result, _Unary_op, _Part);
}
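
// Example (an illustrative sketch, not part of the library): doubling every element of a
// vector in parallel; the default auto_partitioner is used. The names "_Values" and
// "_Doubled" are hypothetical.
//
//     std::vector<int> _Values(1000, 21), _Doubled(1000);
//     Concurrency::parallel_transform(_Values.begin(), _Values.end(), _Doubled.begin(),
//         [](int _Val) { return _Val * 2; });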

/// <summary>
///     Applies a specified function object to each element in a source range, or to a pair of elements from two source ranges,
///     and copies the return values of the function object into a destination range, in parallel. This function is semantically
///     equivalent to <c>std::transform</c>.
/// </summary>
/// <typeparam name="_Input_iterator1">
///     The type of the first or only input iterator.
/// </typeparam>
/// <typeparam name="_Output_iterator">
///     The type of the output iterator.
/// </typeparam>
/// <typeparam name="_Unary_operator">
///     The type of the unary functor to be executed on each element in the input range.
/// </typeparam>
/// <param name="_First1">
///     An input iterator addressing the position of the first element in the first or only source range to be operated on.
/// </param>
/// <param name="_Last1">
///     An input iterator addressing the position one past the final element in the first or only source range to be operated on.
/// </param>
/// <param name="_Result">
///     An output iterator addressing the position of the first element in the destination range.
/// </param>
/// <param name="_Unary_op">
///     A user-defined unary function object that is applied to each element in the source range.
/// </param>
/// <param name="_Part">
///     A reference to the partitioner object. The argument can be one of
///     <c>const</c> <see ref="auto_partitioner Class">auto_partitioner</see><c>&amp;</c>,
///     <c>const</c> <see ref="static_partitioner Class">static_partitioner</see><c>&amp;</c>,
///     <c>const</c> <see ref="simple_partitioner Class">simple_partitioner</see><c>&amp;</c> or
///     <see ref="affinity_partitioner Class">affinity_partitioner</see><c>&amp;</c>
///     If an <see ref="affinity_partitioner Class">affinity_partitioner</see> object is used, the reference must be a non-const l-value reference,
///     so that the algorithm can store state for future loops to re-use.
/// </param>
/// <returns>
///     An output iterator addressing the position one past the final element in the destination range that is receiving the output elements
///     transformed by the function object.
/// </returns>
/// <remarks>
///     <see ref="auto_partitioner Class">auto_partitioner</see> will be used for the overloads without an explicit partitioner argument.
///     <para>For iterators that do not support random access, only <see ref="auto_partitioner Class">auto_partitioner</see> is supported.</para>
///     <para>The overloads that take the argument <paramref name="_Unary_op"/> transform the input range into the output range by applying
///     the unary functor to each element in the input range. <paramref name="_Unary_op"/> must support the function call operator with signature
///     <c>operator()(T)</c> where <c>T</c> is the value type of the range being iterated over.</para>
///     <para>The overloads that take the argument <paramref name="_Binary_op"/> transform two input ranges into the output range by applying the
///     binary functor to one element from the first input range and one element from the second input range. <paramref name="_Binary_op"/> must support
///     the function call operator with signature <c>operator()(T, U)</c> where <c>T</c>, <c>U</c> are value types of the two input iterators.</para>
///     <para>For more information, see <see cref="Parallel Algorithms"/>.</para>
/// </remarks>
/**/
template <typename _Input_iterator1, typename _Output_iterator, typename _Unary_operator>
_Output_iterator parallel_transform(_Input_iterator1 _First1, _Input_iterator1 _Last1, _Output_iterator _Result, const _Unary_operator& _Unary_op, const static_partitioner& _Part)
{
    return _Parallel_transform_unary_impl(_First1, _Last1, _Result, _Unary_op, _Part);
}

/// <summary>
///     Applies a specified function object to each element in a source range, or to a pair of elements from two source ranges,
///     and copies the return values of the function object into a destination range, in parallel. This function is semantically
///     equivalent to <c>std::transform</c>.
/// </summary>
/// <typeparam name="_Input_iterator1">
///     The type of the first or only input iterator.
/// </typeparam>
/// <typeparam name="_Output_iterator">
///     The type of the output iterator.
/// </typeparam>
/// <typeparam name="_Unary_operator">
///     The type of the unary functor to be executed on each element in the input range.
/// </typeparam>
/// <param name="_First1">
///     An input iterator addressing the position of the first element in the first or only source range to be operated on.
/// </param>
/// <param name="_Last1">
///     An input iterator addressing the position one past the final element in the first or only source range to be operated on.
/// </param>
/// <param name="_Result">
///     An output iterator addressing the position of the first element in the destination range.
/// </param>
/// <param name="_Unary_op">
///     A user-defined unary function object that is applied to each element in the source range.
/// </param>
/// <param name="_Part">
///     A reference to the partitioner object. The argument can be one of
///     <c>const</c> <see ref="auto_partitioner Class">auto_partitioner</see><c>&amp;</c>,
///     <c>const</c> <see ref="static_partitioner Class">static_partitioner</see><c>&amp;</c>,
///     <c>const</c> <see ref="simple_partitioner Class">simple_partitioner</see><c>&amp;</c> or
///     <see ref="affinity_partitioner Class">affinity_partitioner</see><c>&amp;</c>
///     If an <see ref="affinity_partitioner Class">affinity_partitioner</see> object is used, the reference must be a non-const l-value reference,
///     so that the algorithm can store state for future loops to re-use.
/// </param>
/// <returns>
///     An output iterator addressing the position one past the final element in the destination range that is receiving the output elements
///     transformed by the function object.
/// </returns>
/// <remarks>
///     <see ref="auto_partitioner Class">auto_partitioner</see> will be used for the overloads without an explicit partitioner argument.
///     <para>For iterators that do not support random access, only <see ref="auto_partitioner Class">auto_partitioner</see> is supported.</para>
///     <para>The overloads that take the argument <paramref name="_Unary_op"/> transform the input range into the output range by applying
///     the unary functor to each element in the input range. <paramref name="_Unary_op"/> must support the function call operator with signature
///     <c>operator()(T)</c> where <c>T</c> is the value type of the range being iterated over.</para>
///     <para>The overloads that take the argument <paramref name="_Binary_op"/> transform two input ranges into the output range by applying the
///     binary functor to one element from the first input range and one element from the second input range. <paramref name="_Binary_op"/> must support
///     the function call operator with signature <c>operator()(T, U)</c> where <c>T</c>, <c>U</c> are value types of the two input iterators.</para>
///     <para>For more information, see <see cref="Parallel Algorithms"/>.</para>
/// </remarks>
/**/
template <typename _Input_iterator1, typename _Output_iterator, typename _Unary_operator>
_Output_iterator parallel_transform(_Input_iterator1 _First1, _Input_iterator1 _Last1, _Output_iterator _Result, const _Unary_operator& _Unary_op, const simple_partitioner& _Part)
{
    return _Parallel_transform_unary_impl(_First1, _Last1, _Result, _Unary_op, _Part);
}
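
// Example (an illustrative sketch): the same transform with an explicit simple_partitioner,
// which divides the range into fixed-size chunks; the chunk size 256 is an arbitrary choice.
//
//     Concurrency::parallel_transform(_Values.begin(), _Values.end(), _Doubled.begin(),
//         [](int _Val) { return _Val * 2; }, Concurrency::simple_partitioner(256));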

/// <summary>
///     Applies a specified function object to each element in a source range, or to a pair of elements from two source ranges,
///     and copies the return values of the function object into a destination range, in parallel. This function is semantically
///     equivalent to <c>std::transform</c>.
/// </summary>
/// <typeparam name="_Input_iterator1">
///     The type of the first or only input iterator.
/// </typeparam>
/// <typeparam name="_Output_iterator">
///     The type of the output iterator.
/// </typeparam>
/// <typeparam name="_Unary_operator">
///     The type of the unary functor to be executed on each element in the input range.
/// </typeparam>
/// <param name="_First1">
///     An input iterator addressing the position of the first element in the first or only source range to be operated on.
/// </param>
/// <param name="_Last1">
///     An input iterator addressing the position one past the final element in the first or only source range to be operated on.
/// </param>
/// <param name="_Result">
///     An output iterator addressing the position of the first element in the destination range.
/// </param>
/// <param name="_Unary_op">
///     A user-defined unary function object that is applied to each element in the source range.
/// </param>
/// <param name="_Part">
///     A reference to the partitioner object. The argument can be one of
///     <c>const</c> <see ref="auto_partitioner Class">auto_partitioner</see><c>&amp;</c>,
///     <c>const</c> <see ref="static_partitioner Class">static_partitioner</see><c>&amp;</c>,
///     <c>const</c> <see ref="simple_partitioner Class">simple_partitioner</see><c>&amp;</c> or
///     <see ref="affinity_partitioner Class">affinity_partitioner</see><c>&amp;</c>
///     If an <see ref="affinity_partitioner Class">affinity_partitioner</see> object is used, the reference must be a non-const l-value reference,
///     so that the algorithm can store state for future loops to re-use.
/// </param>
/// <returns>
///     An output iterator addressing the position one past the final element in the destination range that is receiving the output elements
///     transformed by the function object.
/// </returns>
/// <remarks>
///     <see ref="auto_partitioner Class">auto_partitioner</see> will be used for the overloads without an explicit partitioner argument.
///     <para>For iterators that do not support random access, only <see ref="auto_partitioner Class">auto_partitioner</see> is supported.</para>
///     <para>The overloads that take the argument <paramref name="_Unary_op"/> transform the input range into the output range by applying
///     the unary functor to each element in the input range. <paramref name="_Unary_op"/> must support the function call operator with signature
///     <c>operator()(T)</c> where <c>T</c> is the value type of the range being iterated over.</para>
///     <para>The overloads that take the argument <paramref name="_Binary_op"/> transform two input ranges into the output range by applying the
///     binary functor to one element from the first input range and one element from the second input range. <paramref name="_Binary_op"/> must support
///     the function call operator with signature <c>operator()(T, U)</c> where <c>T</c>, <c>U</c> are value types of the two input iterators.</para>
///     <para>For more information, see <see cref="Parallel Algorithms"/>.</para>
/// </remarks>
/**/
template <typename _Input_iterator1, typename _Output_iterator, typename _Unary_operator>
_Output_iterator parallel_transform(_Input_iterator1 _First1, _Input_iterator1 _Last1, _Output_iterator _Result, const _Unary_operator& _Unary_op, affinity_partitioner& _Part)
{
    return _Parallel_transform_unary_impl(_First1, _Last1, _Result, _Unary_op, _Part);
}
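
// Example (an illustrative sketch): an affinity_partitioner must be passed as a non-const
// l-value so that it can carry cache-affinity state from one loop to the next.
//
//     Concurrency::affinity_partitioner _Ap;
//     for (int _Pass = 0; _Pass < 10; ++_Pass)
//     {
//         Concurrency::parallel_transform(_Values.begin(), _Values.end(), _Values.begin(),
//             [](int _Val) { return _Val + 1; }, _Ap);
//     }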

/// <summary>
///     Applies a specified function object to each element in a source range, or to a pair of elements from two source ranges,
///     and copies the return values of the function object into a destination range, in parallel. This function is semantically
///     equivalent to <c>std::transform</c>.
/// </summary>
/// <typeparam name="_Input_iterator1">
///     The type of the first or only input iterator.
/// </typeparam>
/// <typeparam name="_Input_iterator2">
///     The type of the second input iterator.
/// </typeparam>
/// <typeparam name="_Output_iterator">
///     The type of the output iterator.
/// </typeparam>
/// <typeparam name="_Binary_operator">
///     The type of the binary functor executed pairwise on elements from the two source ranges.
/// </typeparam>
/// <param name="_First1">
///     An input iterator addressing the position of the first element in the first or only source range to be operated on.
/// </param>
/// <param name="_Last1">
///     An input iterator addressing the position one past the final element in the first or only source range to be operated on.
/// </param>
/// <param name="_First2">
///     An input iterator addressing the position of the first element in the second source range to be operated on.
/// </param>
/// <param name="_Result">
///     An output iterator addressing the position of the first element in the destination range.
/// </param>
/// <param name="_Binary_op">
///     A user-defined binary function object that is applied pairwise, in a forward order, to the two source ranges.
/// </param>
/// <param name="_Part">
///     A reference to the partitioner object. The argument can be one of
///     <c>const</c> <see ref="auto_partitioner Class">auto_partitioner</see><c>&amp;</c>,
///     <c>const</c> <see ref="static_partitioner Class">static_partitioner</see><c>&amp;</c>,
///     <c>const</c> <see ref="simple_partitioner Class">simple_partitioner</see><c>&amp;</c> or
///     <see ref="affinity_partitioner Class">affinity_partitioner</see><c>&amp;</c>
///     If an <see ref="affinity_partitioner Class">affinity_partitioner</see> object is used, the reference must be a non-const l-value reference,
///     so that the algorithm can store state for future loops to re-use.
/// </param>
/// <returns>
///     An output iterator addressing the position one past the final element in the destination range that is receiving the output elements
///     transformed by the function object.
/// </returns>
/// <remarks>
///     <see ref="auto_partitioner Class">auto_partitioner</see> will be used for the overloads without an explicit partitioner argument.
///     <para>For iterators that do not support random access, only <see ref="auto_partitioner Class">auto_partitioner</see> is supported.</para>
///     <para>The overloads that take the argument <paramref name="_Unary_op"/> transform the input range into the output range by applying
///     the unary functor to each element in the input range. <paramref name="_Unary_op"/> must support the function call operator with signature
///     <c>operator()(T)</c> where <c>T</c> is the value type of the range being iterated over.</para>
///     <para>The overloads that take the argument <paramref name="_Binary_op"/> transform two input ranges into the output range by applying the
///     binary functor to one element from the first input range and one element from the second input range. <paramref name="_Binary_op"/> must support
///     the function call operator with signature <c>operator()(T, U)</c> where <c>T</c>, <c>U</c> are value types of the two input iterators.</para>
///     <para>For more information, see <see cref="Parallel Algorithms"/>.</para>
/// </remarks>
/**/
template <typename _Input_iterator1, typename _Input_iterator2, typename _Output_iterator, typename _Binary_operator, typename _Partitioner>
_Output_iterator parallel_transform(_Input_iterator1 _First1, _Input_iterator1 _Last1, _Input_iterator2 _First2,
    _Output_iterator _Result, const _Binary_operator& _Binary_op, _Partitioner&& _Part)
{
    typedef typename std::iterator_traits<_Input_iterator1>::iterator_category _Input_iterator_type1;
    typedef typename std::iterator_traits<_Input_iterator2>::iterator_category _Input_iterator_type2;
    typedef typename std::iterator_traits<_Output_iterator>::iterator_category _Output_iterator_type;

    if (_First1 != _Last1)
    {
        _Binary_transform_impl_helper<_Input_iterator_type1, _Input_iterator_type2, _Output_iterator_type>
            ::_Parallel_transform_binary_impl(_First1, _Last1, _First2, _Result, _Binary_op, std::forward<_Partitioner>(_Part));
    }

    return _Result;
}

/// <summary>
///     Applies a specified function object to each element in a source range, or to a pair of elements from two source ranges,
///     and copies the return values of the function object into a destination range, in parallel. This function is semantically
///     equivalent to <c>std::transform</c>.
/// </summary>
/// <typeparam name="_Input_iterator1">
///     The type of the first or only input iterator.
/// </typeparam>
/// <typeparam name="_Input_iterator2">
///     The type of the second input iterator.
/// </typeparam>
/// <typeparam name="_Output_iterator">
///     The type of the output iterator.
/// </typeparam>
/// <typeparam name="_Binary_operator">
///     The type of the binary functor executed pairwise on elements from the two source ranges.
/// </typeparam>
/// <param name="_First1">
///     An input iterator addressing the position of the first element in the first or only source range to be operated on.
/// </param>
/// <param name="_Last1">
///     An input iterator addressing the position one past the final element in the first or only source range to be operated on.
/// </param>
/// <param name="_First2">
///     An input iterator addressing the position of the first element in the second source range to be operated on.
/// </param>
/// <param name="_Result">
///     An output iterator addressing the position of the first element in the destination range.
/// </param>
/// <param name="_Binary_op">
///     A user-defined binary function object that is applied pairwise, in a forward order, to the two source ranges.
/// </param>
/// <returns>
///     An output iterator addressing the position one past the final element in the destination range that is receiving the output elements
///     transformed by the function object.
/// </returns>
/// <remarks>
///     <see ref="auto_partitioner Class">auto_partitioner</see> will be used for the overloads without an explicit partitioner argument.
///     <para>For iterators that do not support random access, only <see ref="auto_partitioner Class">auto_partitioner</see> is supported.</para>
///     <para>The overloads that take the argument <paramref name="_Unary_op"/> transform the input range into the output range by applying
///     the unary functor to each element in the input range. <paramref name="_Unary_op"/> must support the function call operator with signature
///     <c>operator()(T)</c> where <c>T</c> is the value type of the range being iterated over.</para>
///     <para>The overloads that take the argument <paramref name="_Binary_op"/> transform two input ranges into the output range by applying the
///     binary functor to one element from the first input range and one element from the second input range. <paramref name="_Binary_op"/> must support
///     the function call operator with signature <c>operator()(T, U)</c> where <c>T</c>, <c>U</c> are value types of the two input iterators.</para>
///     <para>For more information, see <see cref="Parallel Algorithms"/>.</para>
/// </remarks>
/**/
template <typename _Input_iterator1, typename _Input_iterator2, typename _Output_iterator, typename _Binary_operator>
_Output_iterator parallel_transform(_Input_iterator1 _First1, _Input_iterator1 _Last1, _Input_iterator2 _First2,
    _Output_iterator _Result, const _Binary_operator& _Binary_op)
{
    return parallel_transform(_First1, _Last1, _First2, _Result, _Binary_op, auto_partitioner());
}
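
// Example (an illustrative sketch): adding two ranges element-wise; the vectors are hypothetical.
//
//     std::vector<double> _A(1000, 1.0), _B(1000, 2.0), _Sum(1000);
//     Concurrency::parallel_transform(_A.begin(), _A.end(), _B.begin(), _Sum.begin(),
//         std::plus<double>());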

#pragma warning(pop)

#pragma warning(push)
// Disable C4316: object allocated on the heap may not be aligned 64
#pragma warning(disable: 4316)

/// <summary>
///     The <c>combinable&lt;T&gt;</c> object is intended to provide thread-private copies of data, to perform lock-free
///     thread-local sub-computations during parallel algorithms. At the end of the parallel operation, the
///     thread-private sub-computations can then be merged into a final result. This class can be used instead of
///     a shared variable, and can result in a performance improvement if there would otherwise be a lot of
///     contention on that shared variable.
/// </summary>
/// <typeparam name="_Ty">
///     The data type of the final merged result. The type must have a copy constructor and a default constructor.
/// </typeparam>
/// <remarks>
///     For more information, see <see cref="Parallel Containers and Objects"/>.
/// </remarks>
/**/
template<typename _Ty>
class combinable
{
private:

// Disable warning C4324: structure was padded due to __declspec(align())
// This padding is expected and necessary.
#pragma warning(push)
#pragma warning(disable: 4324)
    __declspec(align(64))
    struct _Node
    {
        unsigned long _M_key;
        _Ty _M_value;
        _Node* _M_chain;

        _Node(unsigned long _Key, _Ty _InitialValue)
            : _M_key(_Key), _M_value(_InitialValue), _M_chain(NULL)
        {
        }
    };
#pragma warning(pop)

    static _Ty _DefaultInit()
    {
        return _Ty();
    }

public:
    /// <summary>
    ///     Constructs a new <c>combinable</c> object.
    /// </summary>
    /// <remarks>
    ///     <para>The first constructor initializes new elements with the default constructor for the type <paramref name="_Ty"/>.</para>
    ///     <para>The second constructor initializes new elements using the initialization functor supplied as the
    ///           <paramref name="_FnInitialize"/> parameter.</para>
    ///     <para>The third constructor is the copy constructor.</para>
    /// </remarks>
    /// <seealso cref="Parallel Containers and Objects"/>
    /**/
    combinable()
        : _M_fnInitialize(_DefaultInit)
    {
        _InitNew();
    }

    /// <summary>
    ///     Constructs a new <c>combinable</c> object.
    /// </summary>
    /// <typeparam name="_Function">
    ///     The type of the initialization functor object.
    /// </typeparam>
    /// <param name="_FnInitialize">
    ///     A function which will be called to initialize each new thread-private value of the type <paramref name="_Ty"/>.
    ///     It must support a function call operator with the signature <c>_Ty ()</c>.
    /// </param>
    /// <remarks>
    ///     <para>The first constructor initializes new elements with the default constructor for the type <paramref name="_Ty"/>.</para>
    ///     <para>The second constructor initializes new elements using the initialization functor supplied as the
    ///           <paramref name="_FnInitialize"/> parameter.</para>
    ///     <para>The third constructor is the copy constructor.</para>
    /// </remarks>
    /// <seealso cref="Parallel Containers and Objects"/>
    /**/
    template <typename _Function>
    explicit combinable(_Function _FnInitialize)
        : _M_fnInitialize(_FnInitialize)
    {
        _InitNew();
    }

    /// <summary>
    ///     Constructs a new <c>combinable</c> object.
    /// </summary>
    /// <param name="_Copy">
    ///     An existing <c>combinable</c> object to be copied into this one.
    /// </param>
    /// <remarks>
    ///     <para>The first constructor initializes new elements with the default constructor for the type <paramref name="_Ty"/>.</para>
    ///     <para>The second constructor initializes new elements using the initialization functor supplied as the
    ///           <paramref name="_FnInitialize"/> parameter.</para>
    ///     <para>The third constructor is the copy constructor.</para>
    /// </remarks>
    /// <seealso cref="Parallel Containers and Objects"/>
    /**/
    combinable(const combinable& _Copy)
        : _M_size(_Copy._M_size), _M_fnInitialize(_Copy._M_fnInitialize)
    {
        _InitCopy(_Copy);
    }

    /// <summary>
    ///     Assigns to a <c>combinable</c> object from another <c>combinable</c> object.
    /// </summary>
    /// <param name="_Copy">
    ///     An existing <c>combinable</c> object to be copied into this one.
    /// </param>
    /// <returns>
    ///     A reference to this <c>combinable</c> object.
    /// </returns>
    /**/
    combinable& operator=(const combinable& _Copy)
    {
        clear();
        delete [] _M_buckets;
        _M_fnInitialize = _Copy._M_fnInitialize;
        _M_size = _Copy._M_size;
        _InitCopy(_Copy);

        return *this;
    }

    /// <summary>
    ///     Destroys a <c>combinable</c> object.
    /// </summary>
    /**/
    ~combinable()
    {
        clear();
        delete [] _M_buckets;
    }

    /// <summary>
    ///     Returns a reference to the thread-private sub-computation.
    /// </summary>
    /// <returns>
    ///     A reference to the thread-private sub-computation.
    /// </returns>
    /// <seealso cref="Parallel Containers and Objects"/>
    /**/
    _Ty& local()
    {
        unsigned long _Key = Concurrency::details::platform::GetCurrentThreadId();
        size_t _Index;
        _Node* _ExistingNode = _FindLocalItem(_Key, &_Index);
        if (_ExistingNode == NULL)
        {
            _ExistingNode = _AddLocalItem(_Key, _Index);
        }

        _CONCRT_ASSERT(_ExistingNode != NULL);
        return _ExistingNode->_M_value;
    }

    /// <summary>
    ///     Returns a reference to the thread-private sub-computation.
    /// </summary>
    /// <param name="_Exists">
    ///     A reference to a boolean. The boolean value referenced by this argument will be
    ///     set to <c>true</c> if the sub-computation already existed on this thread, and set to
    ///     <c>false</c> if this was the first sub-computation on this thread.
    /// </param>
    /// <returns>
    ///     A reference to the thread-private sub-computation.
    /// </returns>
    /// <seealso cref="Parallel Containers and Objects"/>
    /**/
    _Ty& local(bool& _Exists)
    {
        unsigned long _Key = Concurrency::details::platform::GetCurrentThreadId();
        size_t _Index;
        _Node* _ExistingNode = _FindLocalItem(_Key, &_Index);
        if (_ExistingNode == NULL)
        {
            _Exists = false;
            _ExistingNode = _AddLocalItem(_Key, _Index);
        }
        else
        {
            _Exists = true;
        }

        _CONCRT_ASSERT(_ExistingNode != NULL);
        return _ExistingNode->_M_value;
    }

    /// <summary>
    ///     Clears any intermediate computational results from a previous usage.
    /// </summary>
    /**/
    void clear()
    {
        for (size_t _Index = 0; _Index < _M_size; ++_Index)
        {
            _Node* _CurrentNode = _M_buckets[_Index];
            while (_CurrentNode != NULL)
            {
                _Node* _NextNode = _CurrentNode->_M_chain;
                delete _CurrentNode;
                _CurrentNode = _NextNode;
            }
        }
        memset((void*)_M_buckets, 0, _M_size * sizeof _M_buckets[0]);
    }

    /// <summary>
    ///     Computes a final value from the set of thread-local sub-computations by calling the supplied combine functor.
    /// </summary>
    /// <typeparam name="_Function">
    ///     The type of the function object that will be invoked to combine two thread-local sub-computations.
    /// </typeparam>
    /// <param name="_FnCombine">
    ///     The functor that is used to combine the sub-computations. Its signature is <c>T (T, T)</c> or
    ///     <c>T (const T&amp;, const T&amp;)</c>, and it must be associative and commutative.
    /// </param>
    /// <returns>
    ///     The final result of combining all the thread-private sub-computations.
    /// </returns>
    /// <seealso cref="Parallel Containers and Objects"/>
    /**/
    template<typename _Function>
    _Ty combine(_Function _FnCombine) const
    {
        _Node* _CurrentNode = NULL;
        size_t _Index;

        // Look for the first value in the set, and use (a copy of) that as the result.
        // This eliminates a single call (of unknown cost) to _M_fnInitialize.
        for (_Index = 0; _Index < _M_size; ++_Index)
        {
            _CurrentNode = _M_buckets[_Index];
            if (_CurrentNode != NULL)
            {
                 break;
            }
        }

        // No values... return the initializer value.
        if (_CurrentNode == NULL)
        {
            return _M_fnInitialize();
        }

        // Accumulate the rest of the items in the current bucket.
        _Ty _Result = _CurrentNode->_M_value;
        for (_CurrentNode = _CurrentNode->_M_chain; _CurrentNode != NULL; _CurrentNode = _CurrentNode->_M_chain)
        {
            _Result = _FnCombine(_Result, _CurrentNode->_M_value);
        }

        // Accumulate values from the rest of the buckets.
        _CONCRT_ASSERT(_Index < _M_size);
        for (++_Index; _Index < _M_size; ++_Index)
        {
            for (_CurrentNode = _M_buckets[_Index]; _CurrentNode != NULL; _CurrentNode = _CurrentNode->_M_chain)
            {
                _Result = _FnCombine(_Result, _CurrentNode->_M_value);
            }
        }

        return _Result;
    }

    /// <summary>
    ///     Computes a final value from the set of thread-local sub-computations by calling the supplied combine functor
    ///     once per thread-local sub-computation. The final result is accumulated by the function object.
    /// </summary>
    /// <typeparam name="_Function">
    ///     The type of the function object that will be invoked to combine a single thread-local sub-computation.
    /// </typeparam>
    /// <param name="_FnCombine">
    ///     The functor that is used to combine one sub-computation. Its signature is <c>void (T)</c> or
    ///     <c>void (const T&amp;)</c>, and must be associative and commutative.
    /// </param>
    /// <seealso cref="Parallel Containers and Objects"/>
    /**/
    template<typename _Function>
    void combine_each(_Function _FnCombine) const
    {
        for (size_t _Index = 0; _Index < _M_size; ++_Index)
        {
            for (_Node* _CurrentNode = _M_buckets[_Index]; _CurrentNode != NULL; _CurrentNode = _CurrentNode->_M_chain)
            {
                _FnCombine(_CurrentNode->_M_value);
            }
        }
    }

private:
    void _InitNew()
    {
        _M_size = Concurrency::details::_GetCombinableSize();
        _M_buckets = new _Node*[_M_size];
        memset((void*)_M_buckets, 0, _M_size * sizeof _M_buckets[0]);
    }

    void _InitCopy(const combinable& _Copy)
    {
        _M_buckets = new _Node*[_M_size];
        for (size_t _Index = 0; _Index < _M_size; ++_Index)
        {
            _M_buckets[_Index] = NULL;
            for (_Node* _CurrentNode = _Copy._M_buckets[_Index]; _CurrentNode != NULL; _CurrentNode = _CurrentNode->_M_chain)
            {
                _Node* _NewNode = new _Node(_CurrentNode->_M_key, _CurrentNode->_M_value);
                _NewNode->_M_chain = _M_buckets[_Index];
                _M_buckets[_Index] = _NewNode;
            }
        }
    }

    _Node* _FindLocalItem(unsigned long _Key, size_t* _PIndex)
    {
        _CONCRT_ASSERT(_PIndex != NULL);

        *_PIndex = _Key % _M_size;

        // Search at this index for an existing value.
        _Node* _CurrentNode = _M_buckets[*_PIndex];
        while (_CurrentNode != NULL)
        {
            if (_CurrentNode->_M_key == _Key)
            {
                return _CurrentNode;
            }

            _CurrentNode = _CurrentNode->_M_chain;
        }

        return NULL;
    }

    _Node* _AddLocalItem(unsigned long _Key, size_t _Index)
    {
        _Node* _NewNode = new _Node(_Key, _M_fnInitialize());
        _Node* _TopNode;
        do
        {
            _TopNode = _M_buckets[_Index];
            _NewNode->_M_chain = _TopNode;
        } while (_InterlockedCompareExchangePointer(reinterpret_cast<void * volatile *>(&_M_buckets[_Index]), _NewNode, _TopNode) != _TopNode);

        return _NewNode;
    }

private:
    _Node *volatile * _M_buckets;
    size_t _M_size;
    std::tr1::function<_Ty ()> _M_fnInitialize;
};
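
// Example (an illustrative sketch, not part of the library): accumulating a sum without a
// shared lock. Each thread adds into its own local() copy, and combine() merges the
// per-thread results at the end.
//
//     Concurrency::combinable<long long> _Partial;
//     Concurrency::parallel_for(0, 100000, [&_Partial](int _I)
//     {
//         _Partial.local() += _I;
//     });
//     long long _Total = _Partial.combine(std::plus<long long>());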

#pragma warning(pop) // C4316

#pragma push_macro("_MAX_NUM_TASKS_PER_CORE")
#pragma push_macro("_FINE_GRAIN_CHUNK_SIZE")
#pragma push_macro("_SORT_MAX_RECURSION_DEPTH")

// This number is used to control dynamic task splitting.
// The ideal division of chunks (tasks) is one task per core, but that performs very poorly when the tasks
// are not balanced. The simple solution is to allocate more tasks than the number of cores.
// _MAX_NUM_TASKS_PER_CORE is the maximum number of tasks that will be allocated per core.
// If this number is too small, the load-balancing problem will hurt efficiency very seriously, especially
// when the compare operation is expensive.
//
// Note that this number is a maximum -- the dynamic partition system will reduce the number of partitions
// per core based on the dynamic load. If all cores are very busy, the number of partitions will shrink to
// reduce the scheduler overhead.
//
// Initially, the total number of partitions (tasks or chunks), "_Div_num", will be: number of cores * _MAX_NUM_TASKS_PER_CORE.
// _Div_num is halved after each task split. There are two special values for _Div_num:
//     1. When _Div_num drops below _MAX_NUM_TASKS_PER_CORE, more tasks have been split off than there are cores.
//     2. When _Div_num reaches 1 or less, splitting stops and the remaining range is sorted serially.
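//
// A worked example (hypothetical 4-core machine): _Div_num starts at 4 * 1024 = 4096 and is halved on each
// split -- 2048, 1024, 512, ... After 3 splits it drops below _MAX_NUM_TASKS_PER_CORE (case 1 above: more
// tasks than cores have been split off), and after 12 splits it reaches 1 (case 2 above: splitting stops and
// the chunk is sorted serially).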
#define _MAX_NUM_TASKS_PER_CORE 1024

// This number is mainly used to control the sampling and dynamic task-splitting strategies.
// If the user-configurable minimal divisible chunk size (default 2048) is smaller than _FINE_GRAIN_CHUNK_SIZE,
// the random sampling algorithm for quicksort will enter fine-grained mode, and take a strategy that reduces the sampling
// overhead. Also, the dynamic task splitting will enter fine-grained mode, which will split as many tasks as possible.
#define _FINE_GRAIN_CHUNK_SIZE 512

// This is the maximum depth to which quicksort will recurse. If we allowed deeper recursion, a stack overflow could occur.
#define _SORT_MAX_RECURSION_DEPTH 64

template<typename _Random_iterator, typename _Function>
inline size_t _Median_of_three(const _Random_iterator &_Begin, size_t _A, size_t _B, size_t _C, const _Function &_Func, bool &_Potentially_equal)
{
    _Potentially_equal = false;
    if (_Func(_Begin[_A], _Begin[_B]))
    {
        if (_Func(_Begin[_A], _Begin[_C]))
        {
            return _Func(_Begin[_B], _Begin[_C]) ? _B : _C;
        }
        else
        {
            return _A;
        }
    }
    else
    {
        if (_Func(_Begin[_B], _Begin[_C]))
        {
            return _Func(_Begin[_A], _Begin[_C]) ? _A : _C;
        }
        else
        {
            _Potentially_equal = true;
            return _B;
        }
    }
}
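
// For example (assuming _Func is std::less and the values at indices _A, _B, _C are 5, 2, 8):
// _Func(5, 2) is false and _Func(2, 8) is true, so the result is _Func(5, 8) ? _A : _C,
// which selects index _A -- the position of 5, the median of {5, 2, 8}.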

template<typename _Random_iterator, typename _Function>
inline size_t _Median_of_nine(const _Random_iterator &_Begin, size_t _Size, const _Function &_Func, bool &_Potentially_equal)
{
    size_t _Offset = _Size / 8;
    size_t _A = _Median_of_three(_Begin, 0, _Offset, _Offset * 2, _Func, _Potentially_equal),
        _B = _Median_of_three(_Begin, _Offset * 3, _Offset * 4, _Offset * 5, _Func, _Potentially_equal),
        _C = _Median_of_three(_Begin, _Offset * 6, _Offset * 7, _Size - 1, _Func, _Potentially_equal);
    _B = _Median_of_three(_Begin, _A, _B, _C, _Func, _Potentially_equal);

    if (_Potentially_equal)
    {
        _Potentially_equal = !_Func(_Begin[_C], _Begin[_A]);
    }

    return _B;
}

// _Potentially_equal means that potentially all the values in the buffer are equal to the pivot value
template<typename _Random_iterator, typename _Function>
inline size_t _Select_median_pivot(const _Random_iterator &_Begin, size_t _Size, const _Function &_Func, const size_t _Chunk_size, bool &_Potentially_equal)
{
    // Based on the chunk size, apply a different sampling optimization
    if (_Chunk_size < _FINE_GRAIN_CHUNK_SIZE && _Size <= std::max<size_t>(_Chunk_size * 4, static_cast<size_t>(15)))
    {
        bool _Never_care_equal;
        return _Median_of_three(_Begin, 0, _Size / 2, _Size - 1, _Func, _Never_care_equal);
    }
    else
    {
        return _Median_of_nine(_Begin, _Size, _Func, _Potentially_equal);
    }
}

// Use binary search to find a split point in each of the two sorted arrays, so that the total number of elements
// to the left of the two split points equals the total number to the right, and no element to the right of the
// split points is ordered (by _Func) before an element to the left.
template<typename _Random_iterator, typename _Random_buffer_iterator, typename _Function>
size_t _Search_mid_point(const _Random_iterator &_Begin1, size_t &_Len1, const _Random_buffer_iterator &_Begin2, size_t &_Len2, const _Function &_Func)
{
    size_t _Len = (_Len1 + _Len2) / 2, _Index1 = 0, _Index2 = 0;

    while (_Index1 < _Len1 && _Index2 < _Len2)
    {
        size_t _Mid1 = (_Index1 + _Len1) / 2, _Mid2 = (_Index2 + _Len2) / 2;
        if (_Func(_Begin1[_Mid1], _Begin2[_Mid2]))
        {
            if (_Mid1 + _Mid2 < _Len)
            {
                _Index1 = _Mid1 + 1;
            }
            else
            {
                _Len2 = _Mid2;
            }
        }
        else
        {
            if (_Mid1 + _Mid2 < _Len)
            {
                _Index2 = _Mid2 + 1;
            }
            else
            {
                _Len1 = _Mid1;
            }
        }
    }

    if (_Index1 == _Len1)
    {
        _Len2 = _Len - _Len1;
    }
    else
    {
        _Len1 = _Len - _Len2;
    }

    return _Len;
}
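
// For example (an illustrative trace): with _Begin1 = [1, 3, 5, 7], _Begin2 = [2, 4, 6, 8] and _Func = std::less,
// the function returns _Len = 4 and shrinks the lengths to _Len1 = 2 and _Len2 = 2: the left parts {1, 3} and
// {2, 4} together hold the 4 smallest elements, and all of them are ordered before the right parts {5, 7} and {6, 8}.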

// "move" operation is applied between buffers
template<typename _Random_iterator, typename _Random_buffer_iterator, typename _Random_output_iterator, typename _Function>
void _Merge_chunks(_Random_iterator _Begin1, const _Random_iterator &_End1, _Random_buffer_iterator _Begin2, const _Random_buffer_iterator &_End2,
    _Random_output_iterator _Output, const _Function &_Func)
{
    while (_Begin1 != _End1 && _Begin2 != _End2)
    {
        if (_Func(*_Begin1, *_Begin2))
        {
            *_Output++ = std::move(*_Begin1++);
        }
        else
        {
            *_Output++ = std::move(*_Begin2++);
        }
    }

    if (_Begin1 != _End1)
    {
        std::_Move(_Begin1, _End1, _Output);
    }
    else if (_Begin2 != _End2)
    {
        std::_Move(_Begin2, _End2, _Output);
    }
}

// _Div_num threads (tasks) merge two chunks in parallel. _Div_num should be a power of 2; if it is not, the largest
// power of 2 smaller than _Div_num will be used.
template<typename _Random_iterator, typename _Random_buffer_iterator, typename _Random_output_iterator, typename _Function>
void _Parallel_merge(_Random_iterator _Begin1, size_t _Len1, _Random_buffer_iterator _Begin2, size_t _Len2, _Random_output_iterator _Output,
    const _Function &_Func, size_t _Div_num)
{
    // Turn to serial merge or continue splitting chunks based on "_Div_num"
    if (_Div_num <= 1 || (_Len1 <= 1 && _Len2 <= 1))
    {
        _Merge_chunks(_Begin1, _Begin1 + _Len1, _Begin2, _Begin2 + _Len2, _Output, _Func);
    }
    else
    {
        size_t _Mid_len1 = _Len1, _Mid_len2 = _Len2;
        size_t _Mid = _Search_mid_point(_Begin1, _Mid_len1, _Begin2, _Mid_len2, _Func);

        structured_task_group _Tg;
        auto _Handle = make_task([&]
        {
            _Parallel_merge(_Begin1, _Mid_len1, _Begin2, _Mid_len2, _Output, _Func, _Div_num / 2);
        });
        _Tg.run(_Handle);

        _Parallel_merge(_Begin1 + _Mid_len1, _Len1 - _Mid_len1, _Begin2 + _Mid_len2, _Len2 - _Mid_len2, _Output + _Mid, _Func, _Div_num / 2);

        _Tg.wait();
    }
}

// Return the byte of the key that is currently being sorted on
template<typename _Ty, typename _Function>
inline size_t _Radix_key(const _Ty& _Val, size_t _Radix, _Function _Proj_func)
{
    return static_cast<size_t>(_Proj_func(_Val) >> static_cast<int>(8 * _Radix) & 255);
}
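
// For example, with a hypothetical identity projection function, _Radix_key(0x345678, 1, _Identity)
// extracts the second-lowest byte: (0x345678 >> 8) & 255 == 0x56.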

// One pass of radix sort
template<typename _Random_iterator, typename _Random_buffer_iterator, typename _Function>
void _Integer_radix_pass(const _Random_iterator &_Begin, size_t _Size, const _Random_buffer_iterator &_Output, size_t _Radix, _Function _Proj_func)
{
    if (!_Size)
    {
        return;
    }

    size_t _Pos[256] = {0};

    for (size_t _I = 0; _I < _Size; _I++)
    {
        ++_Pos[_Radix_key(_Begin[_I], _Radix, _Proj_func)];
    }

    for (size_t _I = 1; _I < 256; _I++)
    {
        _Pos[_I] += _Pos[_I - 1];
    }

    // _Size > 0 was checked above; handle indices _Size - 1 down to 1 here, and index 0 after the loop
    for (size_t _I = _Size - 1; _I != 0; _I--)
    {
        _Output[--_Pos[_Radix_key(_Begin[_I], _Radix, _Proj_func)]] = std::move(_Begin[_I]);
    }

    _Output[--_Pos[_Radix_key(_Begin[0], _Radix, _Proj_func)]] = std::move(_Begin[0]);
}
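
// For example (an illustrative trace with _Radix == 0 and an identity projection): given
// _Begin = [0x0201, 0x0103, 0x0102], the low-byte keys are 01, 03, 02. After the counting pass and
// prefix sum, _Pos[0x01] = 1, _Pos[0x02] = 2, _Pos[0x03] = 3, and the backward fill produces
// _Output = [0x0201, 0x0102, 0x0103] -- a stable counting sort on that byte.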

// Serial least-significant-byte radix sort; it sorts on bytes 0 through "_Radix" of the key
template<typename _Random_iterator, typename _Random_buffer_iterator, typename _Function>
void _Integer_radix_sort(const _Random_iterator &_Begin, size_t _Size, const _Random_buffer_iterator &_Output,
    size_t _Radix, _Function _Proj_func, size_t _Deep = 0)
{
    size_t _Cur_radix = 0;
    if (_Size == 0)
    {
        return;
    }

    while (_Cur_radix < _Radix)
    {
        _Integer_radix_pass(_Begin, _Size, _Output, _Cur_radix++, _Proj_func);
        _Integer_radix_pass(_Output, _Size, _Begin, _Cur_radix++, _Proj_func);
    }

    if (_Cur_radix == _Radix)
    {
        _Integer_radix_pass(_Begin, _Size, _Output, _Cur_radix++, _Proj_func);
    }

    // If an odd total number of passes has been performed (counting the recursion depth _Deep),
    // the result ended up in the wrong buffer; move it back
    if ((_Deep + _Radix + 1) & 1)
    {
        if ((_Radix + 1) & 1)
        {
            std::_Move(_Output, _Output + _Size, _Begin);
        }
        else
        {
            std::_Move(_Begin, _Begin + _Size, _Output);
        }
    }
}

// Parallel most-significant-byte radix sort.
// At the bottom of the recursion, it falls back to the serial least-significant-byte radix sort
template<typename _Random_iterator, typename _Random_buffer_iterator, typename _Function>
void _Parallel_integer_radix_sort(const _Random_iterator &_Begin, size_t _Size, const _Random_buffer_iterator &_Output,
    size_t _Radix, _Function _Proj_func, const size_t _Chunk_size, size_t _Deep = 0)
{
    // If the range is no larger than the chunk size, fall back to serial least-significant-byte radix sort
    if (_Size <= _Chunk_size || _Radix < 1)
    {
        return _Integer_radix_sort(_Begin, _Size, _Output, _Radix, _Proj_func, _Deep);
    }

    size_t _Threads_num = Concurrency::details::_CurrentScheduler::_GetNumberOfVirtualProcessors();
    size_t _Buffer_size = sizeof(size_t) * 256 * _Threads_num;
    size_t _Step = _Size / _Threads_num;
    size_t _Remain = _Size % _Threads_num;

    Concurrency::details::_MallocaArrayHolder<size_t [256]> _Holder;
    size_t (*_Chunks)[256] = _Holder._InitOnRawMalloca(_malloca(_Buffer_size));

    memset(_Chunks, 0, _Buffer_size);

    // Our purpose is to map unsorted data in buffer "_Begin" to buffer "_Output" so that all elements who have the same
    // byte value in the "_Radix" position will be grouped together in the buffer "_Output"
    //
    // Serial version:
    // To understand this algorithm, first consider a serial version. In the following example, we treat 1 digit as 1 byte, so we have a
    // total of 10 values for each digit instead of 256 values for each byte. Let's suppose "_Radix" == 1 (the rightmost digit is 0), and:
    //
    //      begin:  [ 32 | 62 | 21 | 43 | 55 | 43 | 23 | 44 ]
    //
    // We want to divide the output buffer "_Output" into 10 chunks, and each element in the "_Begin" buffer should be mapped into
    // the proper destination chunk based on its current digit (byte) indicated by "_Radix"
    //
    // Because "_Radix" == 1, after a pass of this function, the chunks in the "_Output" should look like:
    //
    //      buffer: [   |   | 21 23 | 32 | 43 43 44 | 55 | 62 |   |   |   ]
    //                0   1     2      3      4        5    6   7   8   9
    //
    // The difficulty is determining where to insert values into the "_Output" to get the above result. The way to get the
    // start position of each chunk of the buffer is:
    //      1. Count the number of elements for each chunk (in the above example, chunk0 is 0, chunk1 is 0, chunk2 is 2, chunk3 is 1, ...).
    //      2. Make a partial sum over these chunks (in the above example, we will get chunk0 = 0, chunk1 = chunk0 + chunk1 = 0,
    //         chunk2 = chunk0 + chunk1 + chunk2 = 2, chunk3 = chunk0 + chunk1 + chunk2 + chunk3 = 3, ...).
    //
    // After these steps, we will get the end position of each chunk in the "_Output". The begin position of each chunk will be the end
    // position of the previous chunk (the begin position is closed but the end position is open). After that, we can scan the original array
    // again and directly put elements from the original buffer "_Begin" into the specified chunk in the buffer "_Output".
    // Finally, we invoke _Parallel_integer_radix_sort in parallel for each chunk and sort the chunks in parallel based on the next digit (byte).
    // Because this is a STABLE sort algorithm, if two numbers have the same key value on this byte (digit), their original order is kept.
    //
    // Parallel version:
    // Almost the same as the serial version, the differences are:
    //      1. The counting for each chunk is executed in parallel, and each thread counts one segment of the input buffer "_Begin".
    //         Each thread stores its count results separately in its own chunk counting array, so we have one counting array
    //         per thread.
    //         For example, we may have chunk00, chunk01, ..., chunk09 for the first thread, chunk10, chunk11, ..., chunk19 for the second thread, ...
    //      2. The partial sum should be executed across these chunk counting arrays that belong to different threads, instead of just
    //         making a partial sum in one counting array.
    //         This is because we need to put values from different segments into one final buffer, and the absolute buffer position for
    //         each chunkXX is needed.
    //      3. Scan the original buffer again in parallel, and move the numbers in parallel into the corresponding chunk of the output
    //         buffer based on these threads' chunk size counters.

    // Count in parallel and separately save their local results without reducing
    Concurrency::parallel_for(static_cast<size_t>(0), _Threads_num, [=](size_t _Index)
    {
        size_t _Beg_index, _End_index;

        // Calculate the segment position
        if (_Index < _Remain)
        {
            _Beg_index = _Index * (_Step + 1);
            _End_index = _Beg_index + (_Step + 1);
        }
        else
        {
            _Beg_index = _Remain * (_Step + 1) + (_Index - _Remain) * _Step;
            _End_index = _Beg_index + _Step;
        }

        // Do the counting for this segment
        while (_Beg_index != _End_index)
        {
            ++_Chunks[_Index][_Radix_key(_Begin[_Beg_index++], _Radix, _Proj_func)];
        }
    });

    int _Index = -1, _Count = 0;

    // Partial sum across the different threads' chunk counters
    for (int _I = 0; _I < 256; _I++)
    {
        size_t _Last = _I ? _Chunks[_Threads_num - 1][_I - 1] : 0;
        _Chunks[0][_I] += _Last;

        for (size_t _J = 1; _J < _Threads_num; _J++)
        {
            _Chunks[_J][_I] += _Chunks[_J - 1][_I];
        }

        // "_Chunks[_Threads_num - 1][_I] - _Last" will get the global _Size for chunk _I(including all threads local _Size for chunk _I)
        // this will chunk whether the chunk _I is empty or not. If it's not empty, it will be recorded.
        if (_Chunks[_Threads_num - 1][_I] - _Last)
        {
            ++_Count;
            _Index = _I;
        }
    }

    // If there is more than 1 chunk that has content, then continue the original algorithm
    if (_Count > 1)
    {
        // Move the elements in parallel into each chunk
        Concurrency::parallel_for(static_cast<size_t>(0), _Threads_num, [=](size_t _Index)
        {
            size_t _Beg_index, _End_index;

            // Calculate the segment position
            if (_Index < _Remain)
            {
                _Beg_index = _Index * (_Step + 1);
                _End_index = _Beg_index + (_Step + 1);
            }
            else
            {
                _Beg_index = _Remain * (_Step + 1) + (_Index - _Remain) * _Step;
                _End_index = _Beg_index + _Step;
            }

            // Do a move operation to directly put each value into its destination chunk
            // Chunk pointer is moved after each put operation.
            if (_Beg_index != _End_index--)
            {
                while (_Beg_index != _End_index)
                {
                    _Output[--_Chunks[_Index][_Radix_key(_Begin[_End_index], _Radix, _Proj_func)]] = std::move(_Begin[_End_index]);
                    --_End_index;
                }
                _Output[--_Chunks[_Index][_Radix_key(_Begin[_End_index], _Radix, _Proj_func)]] = std::move(_Begin[_End_index]);
            }
        });

        // Invoke _Parallel_integer_radix_sort in parallel for each chunk
        Concurrency::parallel_for(static_cast<size_t>(0), static_cast<size_t>(256), [=](size_t _Index)
        {
            if (_Index < 256 - 1)
            {
                _Parallel_integer_radix_sort(_Output + _Chunks[0][_Index], _Chunks[0][_Index + 1] - _Chunks[0][_Index],
                    _Begin + _Chunks[0][_Index], _Radix - 1, _Proj_func, _Chunk_size, _Deep + 1);
            }
            else
            {
                _Parallel_integer_radix_sort(_Output + _Chunks[0][_Index], _Size - _Chunks[0][_Index],
                    _Begin + _Chunks[0][_Index], _Radix - 1, _Proj_func, _Chunk_size, _Deep + 1);
            }
        });
    }
    else
    {
        // Only one chunk has content.
        // A special optimization is applied because a single chunk means all numbers have the same value on this particular byte (digit).
        // Because there is nothing to sort on this byte (they are all equal at this point), directly call _Parallel_integer_radix_sort to
        // sort the next byte (digit)
        _Parallel_integer_radix_sort(_Begin, _Size, _Output, _Radix - 1, _Proj_func, _Chunk_size, _Deep);
    }
}

template<typename _Random_iterator, typename _Random_buffer_iterator, typename _Function>
void _Parallel_integer_sort_asc(const _Random_iterator &_Begin, size_t _Size, const _Random_buffer_iterator &_Output,
    _Function _Proj_func, const size_t _Chunk_size)
{
    typedef typename std::iterator_traits<_Random_iterator>::value_type _Value_type;
    // The key type of the radix sort. This must be an "unsigned integer-like" type, that is, it must support:
    //     operator>> (int), operator>>= (int), operator& (int), operator<, operator size_t ()
    typedef typename std::remove_const<typename std::remove_reference<decltype(_Proj_func(*_Begin))>::type>::type _Integer_type;

    // Find out the max value, which will be used to determine the highest differing byte (the radix position)
    _Integer_type _Max_val = Concurrency::parallel_reduce(_Begin, _Begin + _Size, _Proj_func(*_Begin),
        [=](_Random_iterator _Begin, _Random_iterator _End, _Integer_type _Init) -> _Integer_type
        {
            while (_Begin != _End)
            {
                _Integer_type _Ret = _Proj_func(*_Begin++);
                if (_Init < _Ret)
                {
                    _Init = _Ret;
                }
            }

            return _Init;
        }, [](const _Integer_type &a, const _Integer_type &b) -> const _Integer_type& {return (a < b)? b : a;});
    size_t _Radix = 0;

    // Find out the highest differing byte
    while (_Max_val >>= 8)
    {
        ++_Radix;
    }

    _Parallel_integer_radix_sort(_Begin, _Size, _Output, _Radix, _Proj_func, _Chunk_size);
}
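
// For example (illustrative only): if the largest projected key is 70000 (0x11170), then
// 70000 >> 8 == 273, 273 >> 8 == 1, and 1 >> 8 == 0, so _Radix == 2 and the radix sort
// only ever examines the low three bytes of each key.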

template<typename _Random_iterator, typename _Function>
void _Parallel_quicksort_impl(const _Random_iterator &_Begin, size_t _Size, const _Function &_Func, size_t _Div_num, const size_t _Chunk_size, int _Depth)
{
    if (_Depth >= _SORT_MAX_RECURSION_DEPTH || _Size <= _Chunk_size || _Size <= static_cast<size_t>(3) || (_Chunk_size >= _FINE_GRAIN_CHUNK_SIZE && _Div_num <= 1))
    {
        return std::sort(_Begin, _Begin + _Size, _Func);
    }

    // Determine whether we need to do a three-way quick sort.
    // We benefit from a three-way split if there are a lot of elements that are EQUAL to the median value;
    // the _Select_median_pivot function tests for this redundant density by sampling
    bool _Is_three_way_split = false;
    size_t _Mid_index = _Select_median_pivot(_Begin, _Size, _Func, _Chunk_size, _Is_three_way_split);

    // Move the median value to the _Begin position.
    if (_Mid_index)
    {
        std::swap(*_Begin, _Begin[_Mid_index]);
    }
    size_t _I = 1, _J = _Size - 1;

    // Three-way or two-way partition
    // _Div_num < _MAX_NUM_TASKS_PER_CORE is checked to make sure it will never do three-way split before splitting enough tasks
    if (_Is_three_way_split && _Div_num < _MAX_NUM_TASKS_PER_CORE)
    {
        while (_Func(*_Begin, _Begin[_J]))
        {
            --_J;
        }

        while (_Func(_Begin[_I], *_Begin))
        {
            ++_I;
        }

        // From this point on, everything to the left of _I is less than the median value, everything to the right of _J is
        // greater than the median value, and the middle part is equal to the median. _K is used to scan between _I and _J
        size_t _K = _J;
        while (_I <= _K)
        {
            if (_Func(_Begin[_K], *_Begin))
            {
                std::swap(_Begin[_I++], _Begin[_K]);
            }
            else
            {
                --_K;
            }

            while (_Func(*_Begin, _Begin[_K]))
            {
                std::swap(_Begin[_K--], _Begin[_J--]);
            }
        }

        ++_J;
    }
    else
    {
        while (_I <= _J)
        {
            // Will stop before _Begin
            while (_Func(*_Begin, _Begin[_J]))
            {
                --_J;
            }

            // There must be another element equal to or greater than *_Begin
            while (_Func(_Begin[_I], *_Begin))
            {
                ++_I;
            }

            if (_I < _J)
            {
                std::swap(_Begin[_I++], _Begin[_J--]);
            }
            else
            {
                break;
            }
        }

        _I = ++_J;
    }

    std::swap(*_Begin, _Begin[--_I]);

    structured_task_group _Tg;
    volatile size_t _Next_div = _Div_num / 2;
    auto _Handle = make_task([&]
    {
        _Parallel_quicksort_impl(_Begin + _J, _Size - _J, _Func, _Next_div, _Chunk_size, _Depth+1);
    });
    _Tg.run(_Handle);

    _Parallel_quicksort_impl(_Begin, _I, _Func, _Next_div, _Chunk_size, _Depth+1);

    // If the forked work still hasn't been scheduled at this point, slow down the creation of new tasks
    if (_Div_num < _MAX_NUM_TASKS_PER_CORE)
    {
        _Next_div /= 2;
    }

    _Tg.wait();
}
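
// A worked example of the two-way partition in _Parallel_quicksort_impl above (illustrative only),
// using std::less with the pivot already swapped to the front:
//
//     [ 4 | 7 | 1 | 9 | 2 | 8 ]     pivot == 4
//
// The partition loop swaps 7 and 2, stops with _I == 2 and _J == 3 after the final adjustments,
// and swaps the pivot back into place:
//
//     [ 1 | 2 | 4 | 9 | 7 | 8 ]
//
// The forked task then sorts the right side [ 9 | 7 | 8 ] while the current thread sorts the
// left side [ 1 | 2 ].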

// This function will be called to sort the elements in the "_Begin" buffer. However, we can't tell whether the result will end up in the buffer
// "_Begin" or the buffer "_Output" when it returns. The return value indicates which buffer holds the sorted result.
// Return true if the sorted result ends up in the "_Output" buffer (that is, the buffers were swapped); return false if it ends up in the "_Begin" buffer.
// We can't always put the result into one assigned buffer because that may cause frequent buffer copies at return time.
template<typename _Random_iterator, typename _Random_buffer_iterator, typename _Function>
inline bool _Parallel_buffered_sort_impl(const _Random_iterator &_Begin, size_t _Size, _Random_buffer_iterator _Output, const _Function &_Func,
    int _Div_num, const size_t _Chunk_size)
{
    static_assert(std::is_same<typename std::iterator_traits<_Random_iterator>::value_type, typename std::iterator_traits<_Random_buffer_iterator>::value_type>::value,
        "same value type expected");

    if (_Div_num <= 1 || _Size <= _Chunk_size)
    {
        _Parallel_quicksort_impl(_Begin, _Size, _Func, _MAX_NUM_TASKS_PER_CORE, _Chunk_size, 0);

        // In case _Size <= _Chunk_size is reached BEFORE the planned stop point (when _Div_num == 1), we need to calculate how many rounds of
        // binary division are left. If an odd number of rounds is left, a buffer move is necessary to make sure the final
        // merge result ends up in the original input array.
        int _Left_div_turns = 0;
        while (_Div_num >>= 1)
        {
            _Left_div_turns++;
        }

        if (_Left_div_turns & 1)
        {
            std::move(_Begin, _Begin + _Size, _Output);
            return true;
        }
        else
        {
            return false;
        }
    }
    else
    {
        size_t _Mid = _Size / 2;
        structured_task_group _Tg;

        auto _Handle = make_task([&, _Chunk_size]
        {
            _Parallel_buffered_sort_impl(_Begin, _Mid, _Output, _Func, _Div_num / 2, _Chunk_size);
        });
        _Tg.run(_Handle);

        bool _Is_buffer_swap = _Parallel_buffered_sort_impl(_Begin + _Mid, _Size - _Mid, _Output + _Mid, _Func, _Div_num / 2, _Chunk_size);

        _Tg.wait();

        if (_Is_buffer_swap)
        {
            _Parallel_merge(_Output, _Mid, _Output + _Mid, _Size - _Mid, _Begin, _Func, _Div_num);
        }
        else
        {
            _Parallel_merge(_Begin, _Mid, _Begin + _Mid, _Size - _Mid, _Output, _Func, _Div_num);
        }

        return !_Is_buffer_swap;
    }
}
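
// A worked example of the buffer bookkeeping (illustrative only): with _Div_num == 4, the range is
// split into 4 leaf chunks. Each leaf quicksorts in place and returns false (its data stays in
// "_Begin"), the first merge pass writes into "_Output" and returns true, and the second merge pass
// writes back into "_Begin" and returns false, so the fully sorted data ends up in the original
// input buffer with no extra copy.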

// Disable the warning about a constant value in a conditional expression.
// This is by design, and it lets the compiler optimize away the trivial constructor.
#pragma warning (push)
#pragma warning (disable: 4127)

// Allocate and construct a buffer
template<typename _Allocator>
inline typename _Allocator::pointer _Construct_buffer(size_t _N, _Allocator &_Alloc)
{
    typename _Allocator::pointer _P = _Alloc.allocate(_N);

    // If the objects being sorted have trivial default constructors, they do not need to be
    // constructed here. This can benefit performance.
    if (!std::has_trivial_default_constructor<typename _Allocator::value_type>::value)
    {
        for (size_t _I = 0; _I < _N; _I++)
        {
            // Objects being sorted must have a default constructor
            typename _Allocator::value_type _T;
            _Alloc.construct(_P + _I, std::forward<typename _Allocator::value_type>(_T));
        }
    }

    return _P;
}

// Destroy and deallocate a buffer
template<typename _Allocator>
inline void _Destroy_buffer(typename _Allocator::pointer _P, size_t _N, _Allocator &_Alloc)
{
    // If the objects being sorted have trivial destructors, they do not need to be
    // destroyed here. This can benefit performance.
    if (!std::has_trivial_destructor<typename _Allocator::value_type>::value)
    {
        for (size_t _I = 0; _I < _N; _I++)
        {
            _Alloc.destroy(_P + _I);
        }
    }

    _Alloc.deallocate(_P, _N);
}

//
// Exception safe RAII wrapper for the allocated buffers
//

template<typename _Allocator>
class _AllocatedBufferHolder
{
public:
    _AllocatedBufferHolder(size_t _Size, const _Allocator & _Alloc): _M_alloc(_Alloc)
    {
        _M_size = _Size;
        _M_buffer = _Construct_buffer(_Size, _M_alloc);
    }

    ~_AllocatedBufferHolder()
    {
        _Destroy_buffer(_M_buffer, _M_size, _M_alloc);
    }

    typename _Allocator::pointer _Get_buffer()
    {
        return _M_buffer;
    }

private:
    size_t _M_size;
    _Allocator _M_alloc;
    typename _Allocator::pointer _M_buffer;
};


#pragma warning (pop)

/// <summary>
///     Arranges the elements in a specified range into a nondescending order, or according to an ordering criterion specified by a binary predicate,
///     in parallel. This function is semantically similar to <c>std::sort</c> in that it is a compare-based, unstable, in-place sort.
/// </summary>
/// <typeparam name="_Random_iterator">
///     The iterator type of the input range.
/// </typeparam>
/// <param name="_Begin">
///     A random-access iterator addressing the position of the first element in the range to be sorted.
/// </param>
/// <param name="_End">
///     A random-access iterator addressing the position one past the final element in the range to be sorted.
/// </param>
/// <remarks>
///     The first overload uses the binary comparator <c>std::less</c>.
///     <para>The second overload uses the supplied binary comparator that should have the signature <c>bool _Func(T, T)</c> where <c>T</c>
///     is the type of the elements in the input range.</para>
///     <para>The algorithm divides the input range into two chunks and successively divides each chunk into two sub-chunks for execution in parallel. The optional
///     argument <paramref name="_Chunk_size"/> can be used to indicate to the algorithm that it should handle chunks of size &lt; <paramref name="_Chunk_size"/>
///     serially.</para>
/// </remarks>
/**/
template<typename _Random_iterator>
inline void parallel_sort(const _Random_iterator &_Begin, const _Random_iterator &_End)
{
    parallel_sort(_Begin, _End, std::less<typename std::iterator_traits<_Random_iterator>::value_type>());
}
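
// Example usage (an illustrative sketch; assumes the caller includes <vector>):
//
//     std::vector<int> _Values = { 9, 2, 7, 4, 1 };
//     Concurrency::parallel_sort(_Values.begin(), _Values.end());
//     // _Values is now { 1, 2, 4, 7, 9 }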

/// <summary>
///     Arranges the elements in a specified range into a nondescending order, or according to an ordering criterion specified by a binary predicate,
///     in parallel. This function is semantically similar to <c>std::sort</c> in that it is a compare-based, unstable, in-place sort.
/// </summary>
/// <typeparam name="_Random_iterator">
///     The iterator type of the input range.
/// </typeparam>
/// <typeparam name="_Function">
///     The type of the binary comparison functor.
/// </typeparam>
/// <param name="_Begin">
///     A random-access iterator addressing the position of the first element in the range to be sorted.
/// </param>
/// <param name="_End">
///     A random-access iterator addressing the position one past the final element in the range to be sorted.
/// </param>
/// <param name="_Func">
///     A user-defined predicate function object that defines the comparison criterion to be satisfied by successive elements in the ordering.
///     A binary predicate takes two arguments and returns <c>true</c> when satisfied and <c>false</c> when not satisfied. This comparator function
///     must impose a strict weak ordering on pairs of elements from the sequence.
/// </param>
/// <param name="_Chunk_size">
///     The minimum size of a chunk that will be split into two for parallel execution.
/// </param>
/// <remarks>
///     The first overload uses the binary comparator <c>std::less</c>.
///     <para>The second overload uses the supplied binary comparator that should have the signature <c>bool _Func(T, T)</c> where <c>T</c>
///     is the type of the elements in the input range.</para>
///     <para>The algorithm divides the input range into two chunks and successively divides each chunk into two sub-chunks for execution in parallel. The optional
///     argument <paramref name="_Chunk_size"/> can be used to indicate to the algorithm that it should handle chunks of size &lt; <paramref name="_Chunk_size"/>
///     serially.</para>
/// </remarks>
/**/
template<typename _Random_iterator, typename _Function>
inline void parallel_sort(const _Random_iterator &_Begin, const _Random_iterator &_End, const _Function &_Func, const size_t _Chunk_size = 2048)
{
    _CONCRT_ASSERT(_Chunk_size > 0);

    // Check for cancellation before the algorithm starts.
    interruption_point();

    size_t _Size = _End - _Begin;
    size_t _Core_num = Concurrency::details::_CurrentScheduler::_GetNumberOfVirtualProcessors();

    if (_Size <= _Chunk_size || _Core_num < 2)
    {
        return std::sort(_Begin, _End, _Func);
    }

    _Parallel_quicksort_impl(_Begin, _Size, _Func, _Core_num * _MAX_NUM_TASKS_PER_CORE, _Chunk_size, 0);
}
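
// Example usage with a comparator and an explicit chunk size (an illustrative sketch; assumes the
// caller includes <vector> and <functional>):
//
//     std::vector<double> _Values = /* ... one million values ... */;
//     Concurrency::parallel_sort(_Values.begin(), _Values.end(), std::greater<double>(), 4096);
//     // _Values is now sorted in descending order; sub-ranges of 4096 or fewer elements
//     // are sorted serially with std::sort.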

/// <summary>
///     Arranges the elements in a specified range into a nondescending order, or according to an ordering criterion specified by a binary predicate,
///     in parallel. This function is semantically similar to <c>std::sort</c> in that it is a compare-based, unstable, in-place sort except that
///     it needs <c>O(n)</c> additional space, and requires default initialization for the elements being sorted.
/// </summary>
/// <typeparam name="_Random_iterator">
///     The iterator type of the input range.
/// </typeparam>
/// <param name="_Begin">
///     A random-access iterator addressing the position of the first element in the range to be sorted.
/// </param>
/// <param name="_End">
///     A random-access iterator addressing the position one past the final element in the range to be sorted.
/// </param>
/// <remarks>
///     All overloads require <c>n * sizeof(T)</c> additional space, where <c>n</c> is the number of elements to be sorted, and <c>T</c> is the element type.
///     In most cases parallel_buffered_sort will show an improvement in performance over <see cref="parallel_sort Function">parallel_sort</see>, and you should
///     use it over parallel_sort if you have the memory available.
///     <para>If you do not supply a binary comparator, <c>std::less</c> is used as the default, which requires the element type to provide
///     <c>operator&lt;()</c>.</para>
///     <para>If you do not supply an allocator type or instance, the STL memory allocator <c>std::allocator&lt;T&gt;</c> is used to allocate the buffer.</para>
///     <para>The algorithm divides the input range into two chunks and successively divides each chunk into two sub-chunks for execution in parallel. The optional
///     argument <paramref name="_Chunk_size"/> can be used to indicate to the algorithm that it should handle chunks of size &lt; <paramref name="_Chunk_size"/>
///     serially.</para>
/// </remarks>
/**/
template<typename _Random_iterator>
inline void parallel_buffered_sort(const _Random_iterator &_Begin, const _Random_iterator &_End)
{
    parallel_buffered_sort<std::allocator<typename std::iterator_traits<_Random_iterator>::value_type>>(_Begin, _End,
        std::less<typename std::iterator_traits<_Random_iterator>::value_type>());
}

/// <summary>
///     Arranges the elements in a specified range into a nondescending order, or according to an ordering criterion specified by a binary predicate,
///     in parallel. This function is semantically similar to <c>std::sort</c> in that it is a compare-based, unstable, in-place sort except that
///     it needs <c>O(n)</c> additional space, and requires default initialization for the elements being sorted.
/// </summary>
/// <typeparam name="_Allocator">
///     The type of an STL compatible memory allocator.
/// </typeparam>
/// <typeparam name="_Random_iterator">
///     The iterator type of the input range.
/// </typeparam>
/// <param name="_Begin">
///     A random-access iterator addressing the position of the first element in the range to be sorted.
/// </param>
/// <param name="_End">
///     A random-access iterator addressing the position one past the final element in the range to be sorted.
/// </param>
/// <remarks>
///     All overloads require <c>n * sizeof(T)</c> additional space, where <c>n</c> is the number of elements to be sorted, and <c>T</c> is the element type.
///     In most cases parallel_buffered_sort will show an improvement in performance over <see cref="parallel_sort Function">parallel_sort</see>, and you should
///     use it over parallel_sort if you have the memory available.
///     <para>If you do not supply a binary comparator, <c>std::less</c> is used as the default, which requires the element type to provide
///     <c>operator&lt;()</c>.</para>
///     <para>If you do not supply an allocator type or instance, the STL memory allocator <c>std::allocator&lt;T&gt;</c> is used to allocate the buffer.</para>
///     <para>The algorithm divides the input range into two chunks and successively divides each chunk into two sub-chunks for execution in parallel. The optional
///     argument <paramref name="_Chunk_size"/> can be used to indicate to the algorithm that it should handle chunks of size &lt; <paramref name="_Chunk_size"/>
///     serially.</para>
/// </remarks>
/**/
template<typename _Allocator, typename _Random_iterator>
inline void parallel_buffered_sort(const _Random_iterator &_Begin, const _Random_iterator &_End)
{
    parallel_buffered_sort<_Allocator>(_Begin, _End,
        std::less<typename std::iterator_traits<_Random_iterator>::value_type>());
}

/// <summary>
///     Arranges the elements in a specified range into a nondescending order, or according to an ordering criterion specified by a binary predicate,
///     in parallel. This function is semantically similar to <c>std::sort</c> in that it is a compare-based, unstable, in-place sort except that
///     it needs <c>O(n)</c> additional space, and requires default initialization for the elements being sorted.
/// </summary>
/// <typeparam name="_Allocator">
///     The type of an STL compatible memory allocator.
/// </typeparam>
/// <typeparam name="_Random_iterator">
///     The iterator type of the input range.
/// </typeparam>
/// <param name="_Alloc">
///     An instance of an STL compatible memory allocator.
/// </param>
/// <param name="_Begin">
///     A random-access iterator addressing the position of the first element in the range to be sorted.
/// </param>
/// <param name="_End">
///     A random-access iterator addressing the position one past the final element in the range to be sorted.
/// </param>
/// <remarks>
///     All overloads require <c>n * sizeof(T)</c> additional space, where <c>n</c> is the number of elements to be sorted, and <c>T</c> is the element type.
///     In most cases parallel_buffered_sort will show an improvement in performance over <see cref="parallel_sort Function">parallel_sort</see>, and you should
///     use it over parallel_sort if you have the memory available.
///     <para>If you do not supply a binary comparator, <c>std::less</c> is used as the default, which requires the element type to provide
///     <c>operator&lt;()</c>.</para>
///     <para>If you do not supply an allocator type or instance, the STL memory allocator <c>std::allocator&lt;T&gt;</c> is used to allocate the buffer.</para>
///     <para>The algorithm divides the input range into two chunks and successively divides each chunk into two sub-chunks for execution in parallel. The optional
///     argument <paramref name="_Chunk_size"/> can be used to indicate to the algorithm that it should handle chunks of size &lt; <paramref name="_Chunk_size"/>
///     serially.</para>
/// </remarks>
/**/
template<typename _Allocator, typename _Random_iterator>
inline void parallel_buffered_sort(const _Allocator& _Alloc, const _Random_iterator &_Begin, const _Random_iterator &_End)
{
    parallel_buffered_sort<_Allocator>(_Alloc, _Begin, _End, std::less<typename std::iterator_traits<_Random_iterator>::value_type>());
}

/// <summary>
///     Arranges the elements in a specified range into a nondescending order, or according to an ordering criterion specified by a binary predicate,
///     in parallel. This function is semantically similar to <c>std::sort</c> in that it is a compare-based, unstable, in-place sort except that
///     it needs <c>O(n)</c> additional space, and requires default initialization for the elements being sorted.
/// </summary>
/// <typeparam name="_Random_iterator">
///     The iterator type of the input range.
/// </typeparam>
/// <typeparam name="_Function">
///     The type of the binary comparator.
/// </typeparam>
/// <param name="_Begin">
///     A random-access iterator addressing the position of the first element in the range to be sorted.
/// </param>
/// <param name="_End">
///     A random-access iterator addressing the position one past the final element in the range to be sorted.
/// </param>
/// <param name="_Func">
///     A user-defined predicate function object that defines the comparison criterion to be satisfied by successive elements in the ordering.
///     A binary predicate takes two arguments and returns <c>true</c> when satisfied and <c>false</c> when not satisfied. This comparator function
///     must impose a strict weak ordering on pairs of elements from the sequence.
/// </param>
/// <param name="_Chunk_size">
///     The minimum size of a chunk that will be split into two for parallel execution.
/// </param>
/// <remarks>
///     All overloads require <c>n * sizeof(T)</c> additional space, where <c>n</c> is the number of elements to be sorted, and <c>T</c> is the element type.
///     In most cases parallel_buffered_sort will show an improvement in performance over <see cref="parallel_sort Function">parallel_sort</see>, and you should
///     use it over parallel_sort if you have the memory available.
///     <para>If you do not supply a binary comparator, <c>std::less</c> is used as the default, which requires the element type to provide
///     <c>operator&lt;()</c>.</para>
///     <para>If you do not supply an allocator type or instance, the STL memory allocator <c>std::allocator&lt;T&gt;</c> is used to allocate the buffer.</para>
///     <para>The algorithm divides the input range into two chunks and successively divides each chunk into two sub-chunks for execution in parallel. The optional
///     argument <paramref name="_Chunk_size"/> can be used to indicate to the algorithm that it should handle chunks of size &lt; <paramref name="_Chunk_size"/>
///     serially.</para>
/// </remarks>
/**/
template<typename _Random_iterator, typename _Function>
inline void parallel_buffered_sort(const _Random_iterator &_Begin, const _Random_iterator &_End, const _Function &_Func, const size_t _Chunk_size = 2048)
{
    parallel_buffered_sort<std::allocator<typename std::iterator_traits<_Random_iterator>::value_type>>(_Begin, _End, _Func, _Chunk_size);
}

/// <summary>
///     Arranges the elements in a specified range into a nondescending order, or according to an ordering criterion specified by a binary predicate,
///     in parallel. This function is semantically similar to <c>std::sort</c> in that it is a compare-based, unstable, in-place sort except that
///     it needs <c>O(n)</c> additional space, and requires default initialization for the elements being sorted.
/// </summary>
/// <typeparam name="_Allocator">
///     The type of an STL compatible memory allocator.
/// </typeparam>
/// <typeparam name="_Random_iterator">
///     The iterator type of the input range.
/// </typeparam>
/// <typeparam name="_Function">
///     The type of the binary comparator.
/// </typeparam>
/// <param name="_Begin">
///     A random-access iterator addressing the position of the first element in the range to be sorted.
/// </param>
/// <param name="_End">
///     A random-access iterator addressing the position one past the final element in the range to be sorted.
/// </param>
/// <param name="_Func">
///     A user-defined predicate function object that defines the comparison criterion to be satisfied by successive elements in the ordering.
///     A binary predicate takes two arguments and returns <c>true</c> when satisfied and <c>false</c> when not satisfied. This comparator function
///     must impose a strict weak ordering on pairs of elements from the sequence.
/// </param>
/// <param name="_Chunk_size">
///     The minimum size of a chunk that will be split into two for parallel execution.
/// </param>
/// <remarks>
///     All overloads require <c>n * sizeof(T)</c> additional space, where <c>n</c> is the number of elements to be sorted, and <c>T</c> is the element type.
///     In most cases parallel_buffered_sort will show an improvement in performance over <see cref="parallel_sort Function">parallel_sort</see>, and you should
///     use it over parallel_sort if you have the memory available.
///     <para>If you do not supply a binary comparator, <c>std::less</c> is used as the default, which requires the element type to provide
///     <c>operator&lt;()</c>.</para>
///     <para>If you do not supply an allocator type or instance, the STL memory allocator <c>std::allocator&lt;T&gt;</c> is used to allocate the buffer.</para>
///     <para>The algorithm divides the input range into two chunks and successively divides each chunk into two sub-chunks for execution in parallel. The optional
///     argument <paramref name="_Chunk_size"/> can be used to indicate to the algorithm that it should handle chunks of size &lt; <paramref name="_Chunk_size"/>
///     serially.</para>
/// </remarks>
/**/
template<typename _Allocator, typename _Random_iterator, typename _Function>
inline void parallel_buffered_sort(const _Random_iterator &_Begin, const _Random_iterator &_End, const _Function &_Func, const size_t _Chunk_size = 2048)
{
    _Allocator _Alloc;
    return parallel_buffered_sort<_Allocator, _Random_iterator, _Function>(_Alloc, _Begin, _End, _Func, _Chunk_size);
}

/// <summary>
///     Arranges the elements in a specified range into a nondescending order, or according to an ordering criterion specified by a binary predicate,
///     in parallel. This function is semantically similar to <c>std::sort</c> in that it is a compare-based, unstable, in-place sort except that
///     it needs <c>O(n)</c> additional space, and requires default initialization for the elements being sorted.
/// </summary>
/// <typeparam name="_Allocator">
///     The type of an STL compatible memory allocator.
/// </typeparam>
/// <typeparam name="_Random_iterator">
///     The iterator type of the input range.
/// </typeparam>
/// <typeparam name="_Function">
///     The type of the binary comparator.
/// </typeparam>
/// <param name="_Alloc">
///     An instance of an STL compatible memory allocator.
/// </param>
/// <param name="_Begin">
///     A random-access iterator addressing the position of the first element in the range to be sorted.
/// </param>
/// <param name="_End">
///     A random-access iterator addressing the position one past the final element in the range to be sorted.
/// </param>
/// <param name="_Func">
///     A user-defined predicate function object that defines the comparison criterion to be satisfied by successive elements in the ordering.
///     A binary predicate takes two arguments and returns <c>true</c> when satisfied and <c>false</c> when not satisfied. This comparator function
///     must impose a strict weak ordering on pairs of elements from the sequence.
/// </param>
/// <param name="_Chunk_size">
///     The minimum size of a chunk that will be split into two for parallel execution.
/// </param>
/// <remarks>
///     All overloads require <c>n * sizeof(T)</c> additional space, where <c>n</c> is the number of elements to be sorted, and <c>T</c> is the element type.
///     In most cases parallel_buffered_sort will show an improvement in performance over <see cref="parallel_sort Function">parallel_sort</see>, and you should
///     use it over parallel_sort if you have the memory available.
///     <para>If you do not supply a binary comparator, <c>std::less</c> is used as the default, which requires the element type to provide
///     <c>operator&lt;()</c>.</para>
///     <para>If you do not supply an allocator type or instance, the STL memory allocator <c>std::allocator&lt;T&gt;</c> is used to allocate the buffer.</para>
///     <para>The algorithm divides the input range into two chunks and successively divides each chunk into two sub-chunks for execution in parallel. The optional
///     argument <paramref name="_Chunk_size"/> can be used to indicate to the algorithm that it should handle chunks of size &lt; <paramref name="_Chunk_size"/>
///     serially.</para>
/// </remarks>
/**/
template<typename _Allocator, typename _Random_iterator, typename _Function>
inline void parallel_buffered_sort(const _Allocator& _Alloc, const _Random_iterator &_Begin, const _Random_iterator &_End, const _Function &_Func, const size_t _Chunk_size = 2048)
{
    _CONCRT_ASSERT(_Chunk_size > 0);

    // Check for cancellation before the algorithm starts.
    interruption_point();

    size_t _Size = _End - _Begin;
    size_t _Core_num = Concurrency::details::_CurrentScheduler::_GetNumberOfVirtualProcessors();

    if (_Size <= _Chunk_size || _Core_num < 2)
    {
        return std::sort(_Begin, _End, _Func);
    }
    const static size_t CORE_NUM_MASK = 0x55555555;

    _AllocatedBufferHolder<_Allocator> _Holder(_Size, _Alloc);

    // Prevent cancellation from happening during the algorithm, because it could leave the buffers in an unknown state.
    run_with_cancellation_token([&]() {
        // This buffered sort algorithm will divide chunks and apply parallel quicksort on each chunk. In the end, it will
        // apply parallel merge to these sorted chunks.
        //
        // We need to decide on the number of chunks to divide the input buffer into. If we divide it into n chunks, log(n)
        // merges will be needed to get the final sorted result. In this algorithm, we have two buffers for each merge
        // operation, let's say buffer A and B. Buffer A is the original input array, buffer B is the additional allocated
        // buffer. Each turn's merge will put the merge result into the other buffer; for example, if we decided to split
        // into 8 chunks in buffer A at very beginning, after one pass of merging, there will be 4 chunks in buffer B.
        // If we apply one more pass of merging, there will be 2 chunks in buffer A again.
        //
        // The problem is we want the final merge pass to put the result back in buffer A, so that we don't need
        // one extra copy to put the sorted data back into buffer A.
        // To make sure the final result is in buffer A (the original input array), we need an even number of merge passes,
        // which means log(n) must be an even number. Thus n must be of the form power(2, even number). For example, when the
        // even number is 2, n is power(2, 2) = 4; when the even number is 4, n is power(2, 4) = 16. When we divide chunks
        // into these numbers, the final merge result will be in the original input array. Now we need to decide the chunk (split)
        // number based on this property and the number of cores.
        //
        // We want to get a chunk (split) number close to the core number (or a little more than the number of cores),
        // and it also needs to satisfy the above property. For an 8-core machine, the best chunk number is 16, because it's
        // the smallest number that satisfies the above property and is bigger than the core number (so that we can utilize all
        // cores; a little more than the core number is OK, since we need to split into more tasks anyway).
        //
        // In this algorithm, we will make this alignment by bit operations (it's easy and clear). For a binary representation,
        // all the numbers that satisfy power(2, even number) will be 1, 100, 10000, 1000000, 100000000 ...
        // After OR-ing these numbers together, we will get a mask (... 0101 0101 0101) which is all possible combinations of
        // power(2, even number). We use _Core_num & CORE_NUM_MASK | _Core_num << 1 & CORE_NUM_MASK, a bit-wise operation to align
        // _Core_num's highest bit into a power(2, even number).
        //
        // This means that if _Core_num = 8, the highest bit in binary is bin(1000), which is not power(2, even number). After this
        // bit-wise operation, it will align to bin(10000) = 16, which is power(2, even number). If _Core_num = 16, after
        // alignment it still returns 16. The trick is to make sure the highest bit of _Core_num aligns to a "1" bit of the
        // mask bin(... 0101 0101 0101). We don't care about the bits of the aligned result other than the highest bit, because they
        // will be ignored in the function.
        _Parallel_buffered_sort_impl(_Begin, _Size, stdext::make_unchecked_array_iterator(_Holder._Get_buffer()),
            _Func, (_Core_num & CORE_NUM_MASK) | ((_Core_num << 1) & CORE_NUM_MASK), _Chunk_size);
    }, cancellation_token::none());

}
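
// Example usage (an illustrative sketch; assumes the caller includes <vector> and <string>): the
// allocator-less overloads default to std::allocator<T>, so a buffered sort looks like:
//
//     std::vector<std::string> _Names = { "carol", "alice", "bob" };
//     Concurrency::parallel_buffered_sort(_Names.begin(), _Names.end());
//     // _Names is now { "alice", "bob", "carol" }; a temporary buffer of _Names.size()
//     // default-constructed elements was allocated for the duration of the sort.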

#pragma warning(push)
#pragma warning (disable: 4127)
//
// This is a default function used for parallel_radixsort which will return just the value.
// It also performs compile-time checks to ensure that the data type is integral.
//
template <typename _DataType>
struct _Radix_sort_default_function
{
    size_t operator()(const _DataType& val) const
    {
        // An instance of this functor returns the value unchanged if the type _DataType is one of the integral types; otherwise it
        // fails to compile via a static assertion.
        // An integral type is one of: bool, char, unsigned char, signed char, wchar_t, short, unsigned short, int, unsigned int, long,
        // and unsigned long.
        // In addition, with compilers that provide them, an integral type can be one of long long, unsigned long long, __int64, and
        // unsigned __int64.
        static_assert(std::is_integral<_DataType>::value,
            "Type should be integral to use default radix function. For more information on integral types, please refer to http://msdn.microsoft.com/en-us/library/bb983099.aspx.");
        static_assert((sizeof(_DataType) <= sizeof(size_t)), "Passed Type is bigger than size_t.");

        if (std::is_unsigned<_DataType>::value)
        {
            return val;
        }
        else
        {
            // The default function needs to take the signed integer-like representation and map it onto an unsigned one. The
            // following code takes the midpoint of the unsigned representable range, (SIZE_MAX/2) + 1, and does an unsigned
            // add of the value. Thus, it maps the [signed_min, signed_max] range onto the [0, unsigned_max] range.
            return (((SIZE_MAX/2) + 1) + static_cast<size_t>(val));
        }
    }
};
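
// A worked example of the signed mapping (illustrative only, on a platform where size_t is 64 bits
// wide, so SIZE_MAX == 0xFFFFFFFFFFFFFFFF and (SIZE_MAX/2) + 1 == 0x8000000000000000):
//
//     val == -1  ->  0x8000000000000000 + 0xFFFFFFFFFFFFFFFF  ==  0x7FFFFFFFFFFFFFFF
//     val ==  0  ->  0x8000000000000000 + 0x0000000000000000  ==  0x8000000000000000
//     val ==  1  ->  0x8000000000000000 + 0x0000000000000001  ==  0x8000000000000001
//
// Negative values therefore map below non-negative ones, and the unsigned key order matches the
// original signed order.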
#pragma warning (pop)

/// <summary>
///     Arranges elements in a specified range into a nondescending order using a radix sorting algorithm. This is a stable sort function which requires a
///     projection function that can project elements to be sorted into unsigned integer-like keys. Default initialization is required for the elements being sorted.
/// </summary>
/// <typeparam name="_Random_iterator">
///     The iterator type of the input range.
/// </typeparam>
/// <param name="_Begin">
///     A random-access iterator addressing the position of the first element in the range to be sorted.
/// </param>
/// <param name="_End">
///     A random-access iterator addressing the position one past the final element in the range to be sorted.
/// </param>
/// <remarks>
///     All overloads require <c>n * sizeof(T)</c> additional space, where <c>n</c> is the number of elements to be sorted, and <c>T</c> is the element type.
///     A unary projection functor with the signature <c>I _Proj_func(T)</c> is required to return a key when given an element, where <c>T</c> is the element
///     type and <c>I</c> is an unsigned integer-like type.
///     <para>If you do not supply a projection function, a default projection function which simply returns the element is used for integral types. In the
///     absence of a projection function, the function will fail to compile if the element type is not integral.</para>
///     <para>If you do not supply an allocator type or instance, the STL memory allocator <c>std::allocator&lt;T&gt;</c> is used to allocate the buffer.</para>
///     <para>The algorithm divides the input range into two chunks and successively divides each chunk into two sub-chunks for execution in parallel. The optional
///     argument <paramref name="_Chunk_size"/> can be used to indicate to the algorithm that it should handle chunks of size &lt; <paramref name="_Chunk_size"/>
///     serially.</para>
/// </remarks>
/**/
template<typename _Random_iterator>
inline void parallel_radixsort(const _Random_iterator &_Begin, const _Random_iterator &_End)
{
    typedef typename std::iterator_traits<_Random_iterator>::value_type _DataType;

    _Radix_sort_default_function<_DataType> _Proj_func;

    parallel_radixsort<std::allocator<_DataType>>(_Begin, _End, _Proj_func, 256 * 256);
}

/// <summary>
///     Arranges elements in a specified range into a nondescending order using a radix sorting algorithm. This is a stable sort function which requires a
///     projection function that can project elements to be sorted into unsigned integer-like keys. Default initialization is required for the elements being sorted.
/// </summary>
/// <typeparam name="_Allocator">
///     The type of an STL compatible memory allocator.
/// </typeparam>
/// <typeparam name="_Random_iterator">
///     The iterator type of the input range.
/// </typeparam>
/// <param name="_Begin">
///     A random-access iterator addressing the position of the first element in the range to be sorted.
/// </param>
/// <param name="_End">
///     A random-access iterator addressing the position one past the final element in the range to be sorted.
/// </param>
/// <remarks>
///     All overloads require <c>n * sizeof(T)</c> additional space, where <c>n</c> is the number of elements to be sorted, and <c>T</c> is the element type.
///     A unary projection functor with the signature <c>I _Proj_func(T)</c> is required to return a key when given an element, where <c>T</c> is the element
///     type and <c>I</c> is an unsigned integer-like type.
///     <para>If you do not supply a projection function, a default projection function which simply returns the element is used for integral types. In the
///     absence of a projection function, the function will fail to compile if the element type is not integral.</para>
///     <para>If you do not supply an allocator type or instance, the STL memory allocator <c>std::allocator&lt;T&gt;</c> is used to allocate the buffer.</para>
///     <para>The algorithm divides the input range into two chunks and successively divides each chunk into two sub-chunks for execution in parallel. The optional
///     argument <paramref name="_Chunk_size"/> can be used to indicate to the algorithm that it should handle chunks of size &lt; <paramref name="_Chunk_size"/>
///     serially.</para>
/// </remarks>
/**/
template<typename _Allocator, typename _Random_iterator>
inline void parallel_radixsort(const _Random_iterator &_Begin, const _Random_iterator &_End)
{
    _Allocator _Alloc;
    return parallel_radixsort<_Allocator, _Random_iterator>(_Alloc, _Begin, _End);
}

/// <summary>
///     Arranges elements in a specified range into a nondescending order using a radix sorting algorithm. This is a stable sort function which requires a
///     projection function that can project elements to be sorted into unsigned integer-like keys. Default initialization is required for the elements being sorted.
/// </summary>
/// <typeparam name="_Allocator">
///     The type of an STL compatible memory allocator.
/// </typeparam>
/// <typeparam name="_Random_iterator">
///     The iterator type of the input range.
/// </typeparam>
/// <param name="_Alloc">
///     An instance of an STL compatible memory allocator.
/// </param>
/// <param name="_Begin">
///     A random-access iterator addressing the position of the first element in the range to be sorted.
/// </param>
/// <param name="_End">
///     A random-access iterator addressing the position one past the final element in the range to be sorted.
/// </param>
/// <remarks>
///     All overloads require <c>n * sizeof(T)</c> additional space, where <c>n</c> is the number of elements to be sorted, and <c>T</c> is the element type.
///     A unary projection functor with the signature <c>I _Proj_func(T)</c> is required to return a key when given an element, where <c>T</c> is the element
///     type and <c>I</c> is an unsigned integer-like type.
///     <para>If you do not supply a projection function, a default projection function which simply returns the element is used for integral types. In the
///     absence of a projection function, the function will fail to compile if the element type is not integral.</para>
///     <para>If you do not supply an allocator type or instance, the STL memory allocator <c>std::allocator&lt;T&gt;</c> is used to allocate the buffer.</para>
///     <para>The algorithm divides the input range into two chunks and successively divides each chunk into two sub-chunks for execution in parallel. The optional
///     argument <paramref name="_Chunk_size"/> can be used to indicate to the algorithm that it should handle chunks of size &lt; <paramref name="_Chunk_size"/>
///     serially.</para>
/// </remarks>
/**/
template<typename _Allocator, typename _Random_iterator>
inline void parallel_radixsort(const _Allocator& _Alloc, const _Random_iterator &_Begin, const _Random_iterator &_End)
{
    typedef typename std::iterator_traits<_Random_iterator>::value_type _DataType;

    _Radix_sort_default_function<_DataType> _Proj_func;

    parallel_radixsort<_Allocator>(_Alloc, _Begin, _End, _Proj_func);
}

/// <summary>
///     Arranges elements in a specified range into a nondescending order using a radix sorting algorithm. This is a stable sort function which requires a
///     projection function that can project elements to be sorted into unsigned integer-like keys. Default initialization is required for the elements being sorted.
/// </summary>
/// <typeparam name="_Random_iterator">
///     The iterator type of the input range.
/// </typeparam>
/// <typeparam name="_Function">
///     The type of the projection function.
/// </typeparam>
/// <param name="_Begin">
///     A random-access iterator addressing the position of the first element in the range to be sorted.
/// </param>
/// <param name="_End">
///     A random-access iterator addressing the position one past the final element in the range to be sorted.
/// </param>
/// <param name="_Proj_func">
///     A user-defined projection function object that converts an element into an integral value.
/// </param>
/// <param name="_Chunk_size">
///     The minimum size of a chunk that will be split into two for parallel execution.
/// </param>
/// <remarks>
///     All overloads require <c>n * sizeof(T)</c> additional space, where <c>n</c> is the number of elements to be sorted, and <c>T</c> is the element type.
///     A unary projection functor with the signature <c>I _Proj_func(T)</c> is required to return a key when given an element, where <c>T</c> is the element
///     type and <c>I</c> is an unsigned integer-like type.
///     <para>If you do not supply a projection function, a default projection function which simply returns the element is used for integral types. In the
///     absence of a projection function, the function will fail to compile if the element type is not integral.</para>
///     <para>If you do not supply an allocator type or instance, the STL memory allocator <c>std::allocator&lt;T&gt;</c> is used to allocate the buffer.</para>
///     <para>The algorithm divides the input range into two chunks and successively divides each chunk into two sub-chunks for execution in parallel. The optional
///     argument <paramref name="_Chunk_size"/> can be used to indicate to the algorithm that it should handle chunks of size &lt; <paramref name="_Chunk_size"/>
///     serially.</para>
/// </remarks>
/**/
template<typename _Random_iterator, typename _Function>
inline void parallel_radixsort(const _Random_iterator &_Begin, const _Random_iterator &_End, const _Function &_Proj_func, const size_t _Chunk_size = 256 * 256)
{
    parallel_radixsort<std::allocator<typename std::iterator_traits<_Random_iterator>::value_type>>(
        _Begin, _End, _Proj_func, _Chunk_size);
}

/// <summary>
///     Arranges elements in a specified range into a nondescending order using a radix sorting algorithm. This is a stable sort function which requires a
///     projection function that can project elements to be sorted into unsigned integer-like keys. Default initialization is required for the elements being sorted.
/// </summary>
/// <typeparam name="_Allocator">
///     The type of an STL compatible memory allocator.
/// </typeparam>
/// <typeparam name="_Random_iterator">
///     The iterator type of the input range.
/// </typeparam>
/// <typeparam name="_Function">
///     The type of the projection function.
/// </typeparam>
/// <param name="_Begin">
///     A random-access iterator addressing the position of the first element in the range to be sorted.
/// </param>
/// <param name="_End">
///     A random-access iterator addressing the position one past the final element in the range to be sorted.
/// </param>
/// <param name="_Proj_func">
///     A user-defined projection function object that converts an element into an integral value.
/// </param>
/// <param name="_Chunk_size">
///     The minimum size of a chunk that will be split into two for parallel execution.
/// </param>
/// <remarks>
///     All overloads require <c>n * sizeof(T)</c> additional space, where <c>n</c> is the number of elements to be sorted, and <c>T</c> is the element type.
///     A unary projection functor with the signature <c>I _Proj_func(T)</c> is required to return a key when given an element, where <c>T</c> is the element
///     type and <c>I</c> is an unsigned integer-like type.
///     <para>If you do not supply a projection function, a default projection function which simply returns the element is used for integral types. In the
///     absence of a projection function, the function will fail to compile if the element type is not integral.</para>
///     <para>If you do not supply an allocator type or instance, the STL memory allocator <c>std::allocator&lt;T&gt;</c> is used to allocate the buffer.</para>
///     <para>The algorithm divides the input range into two chunks and successively divides each chunk into two sub-chunks for execution in parallel. The optional
///     argument <paramref name="_Chunk_size"/> can be used to indicate to the algorithm that it should handle chunks of size &lt; <paramref name="_Chunk_size"/>
///     serially.</para>
/// </remarks>
/**/
template<typename _Allocator, typename _Random_iterator, typename _Function>
inline void parallel_radixsort(const _Random_iterator &_Begin, const _Random_iterator &_End, const _Function &_Proj_func, const size_t _Chunk_size = 256 * 256)
{
    _Allocator _Alloc;
    return parallel_radixsort<_Allocator, _Random_iterator, _Function>(_Alloc, _Begin, _End, _Proj_func, _Chunk_size);
}

/// <summary>
///     Arranges elements in a specified range into a nondescending order using a radix sorting algorithm. This is a stable sort function which requires a
///     projection function that can project elements to be sorted into unsigned integer-like keys. Default initialization is required for the elements being sorted.
/// </summary>
/// <typeparam name="_Allocator">
///     The type of an STL compatible memory allocator.
/// </typeparam>
/// <typeparam name="_Random_iterator">
///     The iterator type of the input range.
/// </typeparam>
/// <typeparam name="_Function">
///     The type of the projection function.
/// </typeparam>
/// <param name="_Alloc">
///     An instance of an STL compatible memory allocator.
/// </param>
/// <param name="_Begin">
///     A random-access iterator addressing the position of the first element in the range to be sorted.
/// </param>
/// <param name="_End">
///     A random-access iterator addressing the position one past the final element in the range to be sorted.
/// </param>
/// <param name="_Proj_func">
///     A user-defined projection function object that converts an element into an integral value.
/// </param>
/// <param name="_Chunk_size">
///     The mimimum size of a chunk that will be split into two for parallel execution.
/// </param>
/// <remarks>
///     All overloads require <c>n * sizeof(T)</c> additional space, where <c>n</c> is the number of elements to be sorted, and <c>T</c> is the element type.
///     A unary projection functor with the signature <c>I _Proj_func(T)</c> is required to return a key when given an element, where <c>T</c> is the element
///     type and <c>I</c> is an unsigned integer-like type.
///     <para>If you do not supply a projection function, a default projection function which simply returns the element is used for integral types. In the
///     absence of a projection function, the function will fail to compile if the element type is not integral.</para>
///     <para>If you do not supply an allocator type or instance, the STL memory allocator <c>std::allocator&lt;T&gt;</c> is used to allocate the buffer.</para>
///     <para>The algorithm divides the input range into two chunks and successively divides each chunk into two sub-chunks for execution in parallel. The optional
///     argument <paramref name="_Chunk_size"/> can be used to indicate to the algorithm that it should handle chunks of size &lt; <paramref name="_Chunk_size"/>
///     serially.</para>
/// </remarks>
/**/
template<typename _Allocator, typename _Random_iterator, typename _Function>
inline void parallel_radixsort(const _Allocator& _Alloc, const _Random_iterator &_Begin, const _Random_iterator &_End, const _Function &_Proj_func, const size_t _Chunk_size = 256 * 256)
{
    _CONCRT_ASSERT(_Chunk_size > 0);

    // Check for cancellation before the algorithm starts.
    interruption_point();

    size_t _Size = _End - _Begin;

    // If _Size <= 1, no more sorting needs to be done.
    if (_Size <= 1)
    {
        return;
    }

    _AllocatedBufferHolder<_Allocator> _Holder(_Size, _Alloc);

    // Prevent cancellation from happening during the algorithm, because it could leave the buffers in an unknown state.
    run_with_cancellation_token([&]() {
        _Parallel_integer_sort_asc(_Begin, _Size, stdext::make_unchecked_array_iterator(_Holder._Get_buffer()), _Proj_func, _Chunk_size);
    }, cancellation_token::none());
}
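
// Example usage with a projection function (an illustrative sketch; _Record and _M_key are
// hypothetical names, and the caller is assumed to include <vector>):
//
//     struct _Record { unsigned int _M_key; /* other fields */ };
//     std::vector<_Record> _Records = /* ... */;
//     Concurrency::parallel_radixsort(_Records.begin(), _Records.end(),
//         [](const _Record& _R) { return _R._M_key; });
//     // _Records is now stably sorted by _M_key in nondescending order.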

#pragma pop_macro("_SORT_MAX_RECURSION_DEPTH")
#pragma pop_macro("_MAX_NUM_TASKS_PER_CORE")
#pragma pop_macro("_FINE_GRAIN_CHUNK_SIZE")
}

namespace concurrency = Concurrency;

#pragma pop_macro("new")
#pragma pack(pop)