• xchg based thing...

    From Chris M. Thomasson@21:1/5 to All on Sun Nov 26 22:14:52 2023
    Fwiw, here is a quick and dirty mock up of my atomic exchange based lifo
    work queue thing. Not sure how to classify it quite yet. I am going to
    use it as a work "queue" for my parts of my CPU based rendering engine,
    no need for it wrt my shader work. However, I needed to do it. It's
    passing Relacy tests. That's a good thing. Now I need to add in a slow
    path so it can wait in the kernel during periods of no work. I think an eventcount should work for it. Or perhaps I might integrate some state
    in the pointer values for futexes, or whatever. I have not made up my
    mind yet.

    I will port my Relacy test units over to standard C++. It's not all that
    hard. In fact, it's rather straightforward.

    Speaking of pointer values, take note of CT_PENDING.

    The fun part about this scheme is that it only uses atomic exchange for
    its logic. It also has wait-free loopless push and flush:


    void push(work* w)
    {
    // this completes the pending logic...
    w->m_next.store(CT_PENDING, rl::mo_relaxed, $);
    work* head = m_head.exchange(w, rl::mo_release, $);
    w->m_next.store(head, rl::mo_release, $);
    }


    work* flush()
    {
    return m_head.exchange(nullptr, rl::mo_acquire, $);
    }


    Here is my current Relacy test unit:

    https://github.com/dvyukov/relacy
    _______________________________
    #include <iostream>
    #include <relacy/relacy.hpp>


    #define CT_PRODUCERS_N 2
    #define CT_CONSUMERS_N 3
    #define CT_THREAD_N (CT_PRODUCERS_N + CT_CONSUMERS_N)


    // Notes:
    // Using spin backoff for waits as of now
    // Need to slow-path it with with kernel waits
    // Need to refine the per-node backoff


    // humm...
    #define CT_PENDING ((ct_xchg_lifo_ver_001::work*)(0xDEADBEEF))

    struct ct_xchg_lifo_ver_001
    {

    // simulate some work...
    struct work
    {
    rl::atomic<work*> m_next;
    VAR_T(unsigned long) m_work;


    work()
    : m_next(CT_PENDING),
    m_work(0)
    {

    }


    ~work()
    {
    RL_ASSERT(VAR(m_work) == 1);
    }


    // no data races on m_work!
    void execute_work()
    {
    VAR(m_work) += 1;
    }


    // this is the pending node logic...
    work* get_next()
    {
    work* next = m_next.load(rl::mo_consume, $);

    while (next == CT_PENDING)
    {
    rl::backoff();
    next = m_next.load(rl::mo_consume, $);
    }

    return next;
    }
    };



    rl::atomic<work*> m_head;

    ct_xchg_lifo_ver_001()
    : m_head(nullptr)
    {

    }

    ~ct_xchg_lifo_ver_001()
    {
    RL_ASSERT(! m_head.load(rl::mo_relaxed, $));
    }



    void push(work* w)
    {
    // this completes the pending logic...
    w->m_next.store(CT_PENDING, rl::mo_relaxed, $);
    work* head = m_head.exchange(w, rl::mo_release, $);
    w->m_next.store(head, rl::mo_release, $);
    }


    work* flush()
    {
    return m_head.exchange(nullptr, rl::mo_acquire, $);
    }


    void process(work* cur)
    {
    // process work nodes...

    while (cur)
    {
    // do real work _before_ the pending logic
    // this is highly beneficial.
    cur->execute_work();

    // get the next work node.
    cur = cur->get_next();
    }
    }


    // dump worked on nodes...
    void destroy(work* cur)
    {
    while (cur)
    {
    work* next = cur->m_next.load(rl::mo_relaxed, $);

    // no pending work shall be destroyed!
    RL_ASSERT(next != CT_PENDING);

    delete cur;
    cur = next;
    }
    }
    };



    struct ct_relacy_test_fun : rl::test_suite<ct_relacy_test_fun, CT_THREAD_N>
    {
    ct_xchg_lifo_ver_001 m_work_lifo;

    void before()
    {

    }

    void after()
    {
    ct_xchg_lifo_ver_001::work* w = m_work_lifo.flush();
    m_work_lifo.process(w);
    m_work_lifo.destroy(w);
    }


    void producer(unsigned int tidx)
    {
    ct_xchg_lifo_ver_001::work* w = new ct_xchg_lifo_ver_001::work();
    m_work_lifo.push(w);
    }


    void consumer(unsigned int tidx)
    {
    ct_xchg_lifo_ver_001::work* w = m_work_lifo.flush();
    m_work_lifo.process(w);
    m_work_lifo.destroy(w);
    }


    void thread(unsigned int tidx)
    {
    if (tidx < CT_PRODUCERS_N)
    {
    // producer threads
    producer(tidx);
    }

    else
    {
    // consumer threads
    consumer(tidx);
    }
    }
    };


    int
    main()
    {
    std::cout << "Exchange LIFO Container Experiment ver:0.0.1\n";
    std::cout << "Relacy Unit Test ver:0.0.1\n";
    std::cout << "by: Chris M. Thomasson\n";
    std::cout << "__________________________________\n" << std::endl;

    {
    rl::test_params p;

    p.iteration_count = 400000;
    //p.execution_depth_limit = 33333;
    //p.search_type = rl::sched_bound;
    //p.search_type = rl::fair_full_search_scheduler_type;
    //p.search_type = rl::fair_context_bound_scheduler_type;

    std::cout << "Executing Relacy Unit Test...\n";
    std::cout << "__________________________________" << std::endl;

    rl::simulate<ct_relacy_test_fun>(p);
    }

    return 0;
    }
    _______________________________

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From Chris M. Thomasson@21:1/5 to Chris M. Thomasson on Sun Nov 26 22:25:47 2023
    On 11/26/2023 10:14 PM, Chris M. Thomasson wrote:
    Fwiw, here is a quick and dirty mock up of my atomic exchange based lifo
    work queue thing. Not sure how to classify it quite yet. I am going to
    use it as a work "queue" for my parts of my CPU based rendering engine,
    no need for it wrt my shader work. However, I needed to do it. It's
    passing Relacy tests. That's a good thing. Now I need to add in a slow
    path so it can wait in the kernel during periods of no work. I think an eventcount should work for it. Or perhaps I might integrate some state
    in the pointer values for futexes, or whatever. I have not made up my
    mind yet.

    I will port my Relacy test units over to standard C++. It's not all that hard. In fact, it's rather straightforward.

    Speaking of pointer values, take note of CT_PENDING.

    The fun part about this scheme is that it only uses atomic exchange for
    its logic. It also has wait-free loopless push and flush:


        void push(work* w)
        {
            // this completes the pending logic...
            w->m_next.store(CT_PENDING, rl::mo_relaxed, $);
            work* head = m_head.exchange(w, rl::mo_release, $);
            w->m_next.store(head, rl::mo_release, $);
        }


        work* flush()
        {
            return m_head.exchange(nullptr, rl::mo_acquire, $);
        }


    Here is my current Relacy test unit:

    https://github.com/dvyukov/relacy
    _______________________________
    [...]
        void push(work* w)
        {
            // this completes the pending logic...
            w->m_next.store(CT_PENDING, rl::mo_relaxed, $);
            work* head = m_head.exchange(w, rl::mo_release, $);

            w->m_next.store(head, rl::mo_release, $);
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

    I also think I can use relaxed for that store! Humm... I just need to
    test it. Tonight is going to be fun. :^)



        }
    [...]

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From Chris M. Thomasson@21:1/5 to Chris M. Thomasson on Sun Nov 26 22:20:31 2023
    On 11/26/2023 10:14 PM, Chris M. Thomasson wrote:
    Fwiw, here is a quick and dirty mock up of my atomic exchange based lifo
    work queue thing. Not sure how to classify it quite yet. I am going to
    use it as a work "queue" for my parts of my CPU based rendering engine,
    no need for it wrt my shader work. However, I needed to do it. It's
    passing Relacy tests. That's a good thing. Now I need to add in a slow
    path so it can wait in the kernel during periods of no work. I think an eventcount should work for it. Or perhaps I might integrate some state
    in the pointer values for futexes, or whatever. I have not made up my
    mind yet.

    I will port my Relacy test units over to standard C++. It's not all that hard. In fact, it's rather straightforward.

    Speaking of pointer values, take note of CT_PENDING.

    The fun part about this scheme is that it only uses atomic exchange for
    its logic. It also has wait-free loopless push and flush:


        void push(work* w)
        {
            // this completes the pending logic...
            w->m_next.store(CT_PENDING, rl::mo_relaxed, $);
            work* head = m_head.exchange(w, rl::mo_release, $);
            w->m_next.store(head, rl::mo_release, $);
        }


        work* flush()
        {
            return m_head.exchange(nullptr, rl::mo_acquire, $);
        }


    Here is my current Relacy test unit:

    https://github.com/dvyukov/relacy
    _______________________________
    #include <iostream>
    #include <relacy/relacy.hpp>


    #define CT_PRODUCERS_N 2
    #define CT_CONSUMERS_N 3
    #define CT_THREAD_N (CT_PRODUCERS_N + CT_CONSUMERS_N)


    // Notes:
    // Using spin backoff for waits as of now
    // Need to slow-path it with with kernel waits
    // Need to refine the per-node backoff


    // humm...
    #define CT_PENDING ((ct_xchg_lifo_ver_001::work*)(0xDEADBEEF))

    struct ct_xchg_lifo_ver_001
    {
    [...]
            // this is the pending node logic...
            work* get_next()
            {
                work* next = m_next.load(rl::mo_consume, $);
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

    Now, I think I might be able to get away with using a relaxed membar
    here instead of consume. I think so because of the acquire release
    relation ship between the push and flush functions. This is were Relacy
    can come in handy. I will have some more time to work on it later on
    tonight. I need to test that. I don't think a consume membar is needed
    here. I think relaxed should work fine. Not exactly sure right now.





                while (next == CT_PENDING)
                {
                    rl::backoff();
                    next = m_next.load(rl::mo_consume, $);
                }

                return next;
            }
        };
    [...]

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From Chris M. Thomasson@21:1/5 to Chris M. Thomasson on Mon Nov 27 18:34:22 2023
    On 11/27/2023 6:32 PM, Chris M. Thomasson wrote:
    On 11/26/2023 10:14 PM, Chris M. Thomasson wrote:
    Fwiw, here is a quick and dirty mock up of my atomic exchange based
    lifo work queue thing. Not sure how to classify it quite yet. I am
    going to use it as a work "queue" for my parts of my CPU based
    rendering engine, no need for it wrt my shader work. However, I needed
    to do it. It's passing Relacy tests. That's a good thing. Now I need
    to add in a slow path so it can wait in the kernel during periods of
    no work. I think an eventcount should work for it. Or perhaps I might
    integrate some state in the pointer values for futexes, or whatever. I
    have not made up my mind yet.

    I will port my Relacy test units over to standard C++. It's not all
    that hard. In fact, it's rather straightforward.

    Speaking of pointer values, take note of CT_PENDING.

    The fun part about this scheme is that it only uses atomic exchange
    for its logic. It also has wait-free loopless push and flush:
    [...]

    Fwiw, here is just a little exploration into distributing it. I will
    make a home for it on GitHub. Check it out, my simple first venture into distributing the work across this experiment wrt n-threads. Take careful notice of local work vs. foreign work.


    I managed to streamline the memory barriers to the weakest possible. Got
    rid of a release membar, and a consume membar. I don't think it can get
    any weaker without being incorrect.

    ___________________________
    #include <iostream>
    #include <cstddef>
    #include <relacy/relacy.hpp>


    #define CT_THREAD_N 5
    #define CT_WORK_N 3
    #define CT_PRODUCER_WORK_N 5
    #define CT_CONSUMER_WORK_N 2
    #define CT_DISTRIBUTE_WORK_N CT_THREAD_N


    // Notes:
    // Using spin backoff for waits as of now
    // Need to slow-path it with with kernel waits
    // Need to refine the per-node backoff


    // humm...
    #define CT_PENDING ((ct_xchg_lifo_ver_001::work*)(0xDEADBEEF))

    static unsigned long g_stats_local_work = 0;
    static unsigned long g_stats_foreign_work = 0;


    struct ct_xchg_lifo_ver_001
    {

        // simulate some work...
        struct work
        {
            rl::atomic<work*> m_next;
            VAR_T(unsigned long) m_work;
            VAR_T(unsigned long) m_tidx;


            work(unsigned long tidx)
            :   m_next(CT_PENDING),
                m_work(0),
                m_tidx(tidx)
            {

            }


            ~work()
            {
                RL_ASSERT(VAR(m_work) == 1);
            }


            // no data races on m_work!
            void execute_work(unsigned long tidx)
            {
                VAR(m_work) += 1;

                unsigned long local_tidx = VAR(m_tidx);

                if (local_tidx != tidx)
                {
                    g_stats_foreign_work += 1;
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

    Fwiw, this is an out of bound stat that I am using. If I were keeping
    stats in a real C++ impl, I would use split counters.
    [...]

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From Chris M. Thomasson@21:1/5 to Chris M. Thomasson on Mon Nov 27 18:32:46 2023
    On 11/26/2023 10:14 PM, Chris M. Thomasson wrote:
    Fwiw, here is a quick and dirty mock up of my atomic exchange based lifo
    work queue thing. Not sure how to classify it quite yet. I am going to
    use it as a work "queue" for my parts of my CPU based rendering engine,
    no need for it wrt my shader work. However, I needed to do it. It's
    passing Relacy tests. That's a good thing. Now I need to add in a slow
    path so it can wait in the kernel during periods of no work. I think an eventcount should work for it. Or perhaps I might integrate some state
    in the pointer values for futexes, or whatever. I have not made up my
    mind yet.

    I will port my Relacy test units over to standard C++. It's not all that hard. In fact, it's rather straightforward.

    Speaking of pointer values, take note of CT_PENDING.

    The fun part about this scheme is that it only uses atomic exchange for
    its logic. It also has wait-free loopless push and flush:
    [...]

    Fwiw, here is just a little exploration into distributing it. I will
    make a home for it on GitHub. Check it out, my simple first venture into distributing the work across this experiment wrt n-threads. Take careful
    notice of local work vs. foreign work.


    I managed to streamline the memory barriers to the weakest possible. Got
    rid of a release membar, and a consume membar. I don't think it can get
    any weaker without being incorrect.

    ___________________________
    #include <iostream>
    #include <cstddef>
    #include <relacy/relacy.hpp>


    #define CT_THREAD_N 5
    #define CT_WORK_N 3
    #define CT_PRODUCER_WORK_N 5
    #define CT_CONSUMER_WORK_N 2
    #define CT_DISTRIBUTE_WORK_N CT_THREAD_N


    // Notes:
    // Using spin backoff for waits as of now
    // Need to slow-path it with with kernel waits
    // Need to refine the per-node backoff


    // humm...
    #define CT_PENDING ((ct_xchg_lifo_ver_001::work*)(0xDEADBEEF))

    static unsigned long g_stats_local_work = 0;
    static unsigned long g_stats_foreign_work = 0;


    struct ct_xchg_lifo_ver_001
    {

    // simulate some work...
    struct work
    {
    rl::atomic<work*> m_next;
    VAR_T(unsigned long) m_work;
    VAR_T(unsigned long) m_tidx;


    work(unsigned long tidx)
    : m_next(CT_PENDING),
    m_work(0),
    m_tidx(tidx)
    {

    }


    ~work()
    {
    RL_ASSERT(VAR(m_work) == 1);
    }


    // no data races on m_work!
    void execute_work(unsigned long tidx)
    {
    VAR(m_work) += 1;

    unsigned long local_tidx = VAR(m_tidx);

    if (local_tidx != tidx)
    {
    g_stats_foreign_work += 1;
    /*
    std::cout << "foriegn work detected "
    << local_tidx
    << " -> "
    << tidx
    << std::endl;
    */
    }

    else
    {
    g_stats_local_work += 1;

    /*
    std::cout << "local work detected "
    << local_tidx
    << " -> "
    << tidx
    << std::endl;
    */
    }
    }


    // this is the pending node logic...
    work* get_next()
    {
    work* next = m_next.load(rl::mo_relaxed, $);

    while (next == CT_PENDING)
    {
    rl::backoff();
    next = m_next.load(rl::mo_relaxed, $);
    }

    return next;
    }



    static void process(work* cur, unsigned long tidx)
    {
    // must not be pending!
    RL_ASSERT(cur != CT_PENDING);

    // process work nodes...

    while (cur)
    {
    // do real work _before_ the pending logic
    // this is highly beneficial... Big time!
    cur->execute_work(tidx);

    // get the next work node.
    cur = cur->get_next();
    }
    }


    // dump worked on nodes...
    static void destroy(work* cur, unsigned long tidx)
    {
    // no pending work shall be destroyed!
    RL_ASSERT(cur != CT_PENDING);

    while (cur)
    {
    work* next = cur->m_next.load(rl::mo_relaxed, $);

    // no pending work shall be destroyed!
    RL_ASSERT(next != CT_PENDING);

    delete cur;
    cur = next;
    }
    }
    };



    rl::atomic<work*> m_head;

    ct_xchg_lifo_ver_001()
    : m_head(nullptr)
    {

    }

    ~ct_xchg_lifo_ver_001()
    {
    RL_ASSERT(! m_head.load(rl::mo_relaxed, $));
    }



    void push(work* w)
    {
    // this completes the pending logic...
    w->m_next.store(CT_PENDING, rl::mo_relaxed, $);
    work* head = m_head.exchange(w, rl::mo_release, $);
    w->m_next.store(head, rl::mo_relaxed, $);
    }


    work* flush()
    {
    return m_head.exchange(nullptr, rl::mo_acquire, $);
    }
    };



    // A test of a simple way to distribute work
    // Using the ct_xchg_lifo_ver_001 API
    template<typename T, std::size_t T_N>
    struct ct_distribute_ver_001
    {
    T m_table[T_N];
    typedef typename T::work work;


    void push(work* w, unsigned int tidx)
    {
    T& t = m_table[tidx % T_N];
    t.push(w);
    }


    work* pop(unsigned int tidx)
    {
    for (std::size_t i = 0; i < T_N; ++i)
    {
    T& t = m_table[(i + tidx) % T_N];
    work* w = t.flush();
    if (w) return w;
    }

    return nullptr;
    }
    };



    // Relacy Test Unit...
    struct ct_relacy_test_fun
    : rl::test_suite<ct_relacy_test_fun, CT_THREAD_N>
    {
    typedef ct_distribute_ver_001<
    ct_xchg_lifo_ver_001,
    CT_DISTRIBUTE_WORK_N
    > distribute;


    distribute m_work;


    void thread(unsigned int tidx)
    {
    for (unsigned long wi = 0; wi < CT_WORK_N; ++wi)
    {
    for (unsigned long i = 0; i < CT_PRODUCER_WORK_N; ++i)
    {
    distribute::work* w = new distribute::work(tidx);
    m_work.push(w, tidx);
    }

    for (unsigned long i = 0; i < CT_CONSUMER_WORK_N; ++i)
    {
    distribute::work* w = m_work.pop(tidx);
    distribute::work::process(w, tidx);
    distribute::work::destroy(w, tidx);
    }
    }
    }
    };


    int
    main()
    {
    std::cout << "Exchange LIFO Container Experiment ver:0.0.2\n";
    std::cout << "Relacy Unit Test ver:0.0.2\n";
    std::cout << "by: Chris M. Thomasson\n";
    std::cout << "__________________________________\n" << std::endl;

    {
    rl::test_params p;

    p.iteration_count = 1000000;
    //p.execution_depth_limit = 33333;
    //p.search_type = rl::sched_bound;
    //p.search_type = rl::fair_full_search_scheduler_type;
    //p.search_type = rl::fair_context_bound_scheduler_type;

    std::cout << "Executing Relacy Unit Test...\n";
    std::cout << "__________________________________" << std::endl;

    rl::simulate<ct_relacy_test_fun>(p);


    std::cout << "\nwork load "
    << g_stats_local_work
    << ", "
    << g_stats_foreign_work
    << std::endl;
    }

    return 0;
    }
    ___________________________

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From Chris M. Thomasson@21:1/5 to Chris M. Thomasson on Wed Nov 29 15:29:47 2023
    On 11/27/2023 6:34 PM, Chris M. Thomasson wrote:
    On 11/27/2023 6:32 PM, Chris M. Thomasson wrote:
    On 11/26/2023 10:14 PM, Chris M. Thomasson wrote:
    Fwiw, here is a quick and dirty mock up of my atomic exchange based
    lifo work queue thing. Not sure how to classify it quite yet. I am
    going to use it as a work "queue" for my parts of my CPU based
    rendering engine, no need for it wrt my shader work. However, I
    needed to do it. It's passing Relacy tests. That's a good thing. Now
    I need to add in a slow path so it can wait in the kernel during
    periods of no work. I think an eventcount should work for it. Or
    perhaps I might integrate some state in the pointer values for
    futexes, or whatever. I have not made up my mind yet.

    I will port my Relacy test units over to standard C++. It's not all
    that hard. In fact, it's rather straightforward.

    Speaking of pointer values, take note of CT_PENDING.

    The fun part about this scheme is that it only uses atomic exchange
    for its logic. It also has wait-free loopless push and flush:
    [...]

    Fwiw, here is just a little exploration into distributing it. I will
    make a home for it on GitHub. Check it out, my simple first venture
    into distributing the work across this experiment wrt n-threads. Take
    careful notice of local work vs. foreign work.


    I managed to streamline the memory barriers to the weakest possible.
    Got rid of a release membar, and a consume membar. I don't think it
    can get any weaker without being incorrect.

    ___________________________
    #include <iostream>
    #include <cstddef>
    #include <relacy/relacy.hpp>


    #define CT_THREAD_N 5
    #define CT_WORK_N 3
    #define CT_PRODUCER_WORK_N 5
    #define CT_CONSUMER_WORK_N 2
    #define CT_DISTRIBUTE_WORK_N CT_THREAD_N


    // Notes:
    // Using spin backoff for waits as of now
    // Need to slow-path it with with kernel waits
    // Need to refine the per-node backoff


    // humm...
    #define CT_PENDING ((ct_xchg_lifo_ver_001::work*)(0xDEADBEEF))

    static unsigned long g_stats_local_work = 0;
    static unsigned long g_stats_foreign_work = 0;


    struct ct_xchg_lifo_ver_001
    {

         // simulate some work...
         struct work
         {
             rl::atomic<work*> m_next;
             VAR_T(unsigned long) m_work;
             VAR_T(unsigned long) m_tidx;


             work(unsigned long tidx)
             :   m_next(CT_PENDING),
                 m_work(0),
                 m_tidx(tidx)
             {

             }


             ~work()
             {
                 RL_ASSERT(VAR(m_work) == 1);
             }


             // no data races on m_work!
             void execute_work(unsigned long tidx)
             {
                 VAR(m_work) += 1;

                 unsigned long local_tidx = VAR(m_tidx);

                 if (local_tidx != tidx)
                 {
                     g_stats_foreign_work += 1;
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

    Fwiw, this is an out of bound stat that I am using. If I were keeping
    stats in a real C++ impl, I would use split counters.
    [...]

    Actually, I should refer to the stats as out-of-band state that is
    outside of the Relacy unit test.

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)