Porting the Arduino millis(), possible? Topic is solved

Technical questions regarding the XTC tools and programming with XMOS.
User avatar
aclassifier
Respected Member
Posts: 507
Joined: Wed Apr 25, 2012 8:52 pm

Porting the Arduino millis(), possible?

Post by aclassifier »

I have struggled with porting the arduino millis() [1] code. I ended up with four more or less general timer cases:
  • C that works with and XC that fails. (Code piece 1 below)
    The C-code for the Arduino is “blocking” and there is no support for multi-threading. However, my first trial to port this to XC failed
  • XC that works with 0.65536 ms resolution. (Code piece 2 below)
    But a version where the 1 ms resolution of the Arduino millis() is ported as 0.65536 ms ticks in XC works. The problem is how “modulo 1 ms” arithmetic is done when the timer word width is 32 bits and the XMOS processor increments it every 10 ns. So 10 ns * 2^16 -> 0.65536 ms. This timing is done “inline”, but since everything in XC is multi-threaded (multi-task) that doesn’t really harm it. The drawback is that I can’t order an exact 100 ms timeout (but I could do 152 ticks = 99.61472 ms)
  • XC that works with 1 ms resolution. (Code at blog note)
    Here we have proper 1 ms resolution, done by “proper” handling by an XC timer, by storing a future timeout-value in proper 32 bits width. The code is “inline” as above
  • XC server with state-based timing serving two clients. (Code at blog note)
    Here the timing is handled in a proper select case. I guess this is closest to idiomatic XC. The server serves two clients, one that fails and one that succeeds in reading a register. Impressively, the handling of the two clients is “fair”, i.e. neither of them jams the other
All of the code and the logs are shown in a blog note [2]. However, I will show code for (1) (that fails) and (2) above that works on 0.65536 ms:

First code "C that works with and XC that fails" (1):

Code: Select all

#include <stdio.h>
#include <xs1.h>
#include <iso646.h>
#include <timer.h> // delay_milliseconds(200), XS1_TIMER_HZ etc

// Minimal boolean type: false == 0, true == 1.
typedef enum {false,true} bool;

// Set to 0 to silence debug_print(); the printf call is then guarded by a constant-false test.
#define DEBUG_PRINT_TEST 1
// printf wrapper gated on DEBUG_PRINT_TEST. The do/while(0) makes it usable as a single statement.
// NOTE(review): fmt must be followed by at least one argument, since __VA_ARGS__ is placed after a comma.
#define debug_print(fmt, ...) do { if(DEBUG_PRINT_TEST) printf(fmt, __VA_ARGS__); } while (0)

// Attempted port of the Arduino millis(): reads the timer and scales the tick count to
// milliseconds by dividing by XS1_TIMER_KHZ.
// The result is derived directly from the timer's tick count, so it restarts whenever that
// count wraps instead of continuing to grow; the log below shows it going from 42930 to 0.
signed millis () {
    timer tmr;
    signed current_time;
    tmr :> current_time; // Input the current tick count from the timer
    return (current_time / XS1_TIMER_KHZ); // Never works! XS1_TIMER_KHZ is 100*1000
}

// Stand-in for the Arduino digitalRead(): no pin is sampled, the level is always reported as 0.
unsigned digitalRead (void) {
    const unsigned level = 0;
    return level;
}

#define RF69_TX_LIMIT_MS 100 // Timeout for one polling round, in milliseconds
#define TEST_TYPE signed // Too wide and does not match millis()
#define MILLIS() millis()

// Runs 500 polling rounds. Each round records a start time and then polls every 20 ms,
// logging the current time, the start time, their difference and the timed-out flag,
// until digitalRead() returns non-zero or the difference reaches RF69_TX_LIMIT_MS.
void test_task (void) {

    TEST_TYPE start_ms;
    TEST_TYPE now_ms;
    bool      within_limit;
    unsigned  attempt;

    for (attempt = 0; attempt < 500; attempt++) {
        start_ms = MILLIS();
        do {
            now_ms = MILLIS();
            within_limit = (now_ms - start_ms) < RF69_TX_LIMIT_MS;
            debug_print    ("testCnt(%d), millis(%d), txStart(%d), millis_-txStart(%d), timedOut(%d)\n",
                    attempt, now_ms, start_ms, now_ms - start_ms, !within_limit);
            delay_milliseconds(20);
        // digitalRead() is evaluated first so it is called on every pass, as before
        } while ((digitalRead() == 0) && within_limit);
    }
}

// Entry point: runs test_task() inside a par block (a single task here) and returns 0 afterwards.
int main() {
    par {
        test_task();
    }
    return 0;
}

/* DOES NOT WORK!
testCnt(351), millis(42729), txStart(42629), millis_-txStart(100), timedOut(1)
testCnt(352), millis(42749), txStart(42749), millis_-txStart(0), timedOut(0)
testCnt(352), millis(42769), txStart(42749), millis_-txStart(20), timedOut(0)
testCnt(352), millis(42789), txStart(42749), millis_-txStart(40), timedOut(0)
testCnt(352), millis(42809), txStart(42749), millis_-txStart(60), timedOut(0)
testCnt(352), millis(42829), txStart(42749), millis_-txStart(80), timedOut(0)
testCnt(352), millis(42849), txStart(42749), millis_-txStart(100), timedOut(1) LAST TIMEOUT EVER!
testCnt(353), millis(42869), txStart(42869), millis_-txStart(0), timedOut(0)
testCnt(353), millis(42890), txStart(42869), millis_-txStart(21), timedOut(0)
testCnt(353), millis(42910), txStart(42869), millis_-txStart(41), timedOut(0)
testCnt(353), millis(42930), txStart(42869), millis_-txStart(61), timedOut(0)
testCnt(353), millis(0), txStart(42869), millis_-txStart(-42869), timedOut(0)
testCnt(353), millis(20), txStart(42869), millis_-txStart(-42849), timedOut(0)
testCnt(353), millis(40), txStart(42869), millis_-txStart(-42829), timedOut(0)
testCnt(353), millis(60), txStart(42869), millis_-txStart(-42809), timedOut(0)
testCnt(353), millis(80), txStart(42869), millis_-txStart(-42789), timedOut(0)
testCnt(353), millis(100), txStart(42869), millis_-txStart(-42769), timedOut(0)
testCnt(353), millis(120), txStart(42869), millis_-txStart(-42749), timedOut(0)
testCnt(353), millis(140), txStart(42869), millis_-txStart(-42729), timedOut(0)
*/
Then the code "that works with 0.65536 ms resolution" (2) (6Mar2018: updated to more general code with the introduction of a "fast millisecond" (fms) !-):

Code: Select all

#include <stdio.h>
#include <xs1.h>
#include <iso646.h>
#include <timer.h>     // delay_milliseconds(200), XS1_TIMER_HZ etc

// Minimal boolean type: false == 0, true == 1.
typedef enum {false,true} bool;

// Set to 0 to silence debug_print(); the printf call is then guarded by a constant-false test.
#define DEBUG_PRINT_TEST 1
// printf wrapper gated on DEBUG_PRINT_TEST. The do/while(0) makes it usable as a single statement.
// NOTE(review): fmt must be followed by at least one argument, since __VA_ARGS__ is placed after a comma.
#define debug_print(fmt, ...) do { if(DEBUG_PRINT_TEST) printf(fmt, __VA_ARGS__); } while (0)

// DEFINING A FMS-TICK AS A "fast ms" (fms, FMS) - AND IT IS 0.65536 ms (2exp16=65536)
// One fms-tick is 65536 system-ticks of the 100 MHz XMOS timer (10 ns each), i.e. the
// upper 16 bits of the 32-bit timer value.
//
typedef signed short time16_fms_t; // fms=fast ms
//
// Rounded number of fms-ticks in the given time span
#define FAST_MILLIS_PER_10MS    15 //   10 / .65536
#define FAST_MILLIS_PER_100MS  153 //  100 / .65536
#define FAST_MILLIS_PER_1S    1526 // 1000 / .65536
//
// Converts milliseconds to fms-ticks with integer arithmetic (the result is truncated,
// so MS_TO_FMS(100) is 152). The argument is parenthesized so that an expression such as
// MS_TO_FMS(a + b) converts the whole sum instead of scaling only b.
#define MS_TO_FMS(ms) (((ms)*FAST_MILLIS_PER_1S)/1000)

// Returns the current time in fms-ticks: the upper 16 bits of the timer value, so one
// returned tick is 0.65536 ms (10ns * 65536) and the result wraps as a signed 16-bit number.
time16_fms_t fms() { // fms=fast ms. Returns one tick as 0.65536 ms (10ns * 65536)
    timer tmr;           // 32 bits
    signed current_time; // 32 bits
    tmr :> current_time; // 32 bits
    // NOTE(review): >> on a negative signed value is implementation-defined in C, and / 65536
    // would round toward zero rather than down for negative values - confirm before swapping.
    return (time16_fms_t) (current_time >> 16); // 16 bits. Keep sign bit (or use / 65536)
}

// Stand-in for the Arduino digitalRead(): no pin is sampled, the level is always reported as 0.
unsigned digitalRead (void) {
    const unsigned level = 0;
    return level;
}

#define RF69_TX_LIMIT_MS    100                          // Timeout for one polling round, in milliseconds
#define RF69_TX_LIMIT_FMS   MS_TO_FMS (RF69_TX_LIMIT_MS) // The same timeout expressed in fms-ticks

// Prints the timeout in fms-ticks, then runs 500 polling rounds. Each round records a start
// time and polls every RF69_TX_LIMIT_MS/2 milliseconds, logging the current time, the start
// time, their difference and the timed-out flag, until digitalRead() returns non-zero or the
// difference reaches RF69_TX_LIMIT_FMS.
void test_task (void) {

    time16_fms_t start_fms;
    time16_fms_t current_fms;
    time16_fms_t elapsed_fms;
    bool         within_limit;
    unsigned     attempt;

    debug_print ("100 ms is %d ticks\n",RF69_TX_LIMIT_FMS);

    for (attempt = 0; attempt < 500; attempt++) {
        start_fms = fms();
        do {
            current_fms = fms();
            // Stored back into 16 bits, so the difference stays small and positive even
            // when fms() has wrapped from positive to negative between the two readings
            elapsed_fms = current_fms - start_fms;
            within_limit = elapsed_fms < RF69_TX_LIMIT_FMS;
            debug_print    ("testCnt(%d), now_fms(%d), txStart_fms(%d), diff_fms(%d), timedOut(%d)\n",
                    attempt, current_fms, start_fms, elapsed_fms, !within_limit);
            delay_milliseconds (RF69_TX_LIMIT_MS/2); // TRUE 50 ms!
        // digitalRead() is evaluated first so it is called on every pass, as before
        } while ((digitalRead() == 0) && within_limit);
    }
}

// Entry point: runs test_task() inside a par block (a single task here) and returns 0 afterwards.
int main() {
    par {
        test_task();
    }
    return 0;
}
/* WORKS:
100 ms is 152 ticks
testCnt(0), now_fms(581), txStart_fms(581), diff_fms(0), timedOut(0)
testCnt(0), now_fms(657), txStart_fms(581), diff_fms(76), timedOut(0)
testCnt(0), now_fms(734), txStart_fms(581), diff_fms(153), timedOut(1)
testCnt(1), now_fms(810), txStart_fms(810), diff_fms(0), timedOut(0)
...
testCnt(139), now_fms(32586), txStart_fms(32434), diff_fms(152), timedOut(1)
testCnt(140), now_fms(32663), txStart_fms(32663), diff_fms(0), timedOut(0)
testCnt(140), now_fms(32739), txStart_fms(32663), diff_fms(76), timedOut(0)
testCnt(140), now_fms(-32719), txStart_fms(32663), diff_fms(154), timedOut(1)
testCnt(141), now_fms(-32643), txStart_fms(-32643), diff_fms(0), timedOut(0)
testCnt(141), now_fms(-32567), txStart_fms(-32643), diff_fms(76), timedOut(0)
testCnt(141), now_fms(-32490), txStart_fms(-32643), diff_fms(153), timedOut(1)
*/
See "diff" in the last code. I won't say any more, you readers here would take this better without my words.

I guess my question is, has anyone of you made a true millis() equivalent?

I guess it would be possible by writing code for a 100*1000 modulo sized arithmetics? I guess it would need plus, minus, mult and div. Or is there some magically simpler solution that I haven't thought of? I might have dug myself into a hole!-(

[1] https://www.arduino.cc/reference/en/lan ... me/millis/
[2] http://www.teigfam.net/oyvind/home/tech ... th_timeout Disclaimer: no money, no ads, no gifts, only fun and expenses on all my blog notes!
--
Øyvind Teig
Trondheim (Norway)
https://www.teigfam.net/oyvind/home/
View Solution
User avatar
aclassifier
Respected Member
Posts: 507
Joined: Wed Apr 25, 2012 8:52 pm

Post by aclassifier »

Since I haven't had any comments on this then maybe it's true that we can't make a true "millis(..)" function in XC? Yes, I know about the delay_milliseconds(..), but that's not what I am after. I want to read some value that ticks as true millis. I have the code to get it right (above), but it's another mechanism.
--
Øyvind Teig
Trondheim (Norway)
https://www.teigfam.net/oyvind/home/
User avatar
akp
XCore Expert
Posts: 580
Joined: Thu Nov 26, 2015 11:47 pm

Post by akp »

I came up with two ideas for you wrt your first implementation and updated test_task() to enable parallel instantiation to start to check for race conditions. I suppose tmr will overflow after about 43 seconds (not sure) so you'd have to call it with an interval less than the overflow time for it to work. I don't know if something like this is what you really want or not.

EDIT: I used debug_printf.h since I think it's faster than stdio.h printf

Code: Select all

#include "xc_ptr.h"

unsigned long g_current_ms = 0; // Milliseconds accumulated so far by millis_g()
unsigned long g_last_tick = 0;  // Timer value up to which g_current_ms has been accounted for

// Millisecond counter kept in two shared globals.
// Each call reads the timer, converts the ticks elapsed since g_last_tick into whole
// milliseconds and adds them to g_current_ms, which is also the return value.
// Only the ticks that made up whole milliseconds are consumed; the remainder stays pending
// for the next call, so frequent calls no longer lose time.
// The elapsed-tick subtraction relies on unsigned wrap-around, so the function has to be
// called more often than the timer wraps (estimated in the thread at about 43 seconds).
// The read-modify-write of the globals is not protected against concurrent callers.
unsigned long millis_g () {
    timer tmr;
    unsigned long current_ms;
    unsigned long last_tick;
    unsigned long elapsed_ms;
    unsigned current_tick;
    /* Possible race condition if accessed from multiple cores? */
    tmr :> current_tick;
    GET_SHARED_GLOBAL(last_tick, g_last_tick);
    GET_SHARED_GLOBAL(current_ms, g_current_ms);
    elapsed_ms = (unsigned long)(current_tick - last_tick) / XS1_TIMER_KHZ;
    current_ms += elapsed_ms;
    /* Advance only by the ticks that were converted, keeping the sub-millisecond remainder */
    last_tick += elapsed_ms * XS1_TIMER_KHZ;
    SET_SHARED_GLOBAL(g_last_tick, last_tick);
    SET_SHARED_GLOBAL(g_current_ms, current_ms);
    /* End possible race condition */
    return current_ms;
}

// Caller-owned state for millis_p(); initialise both fields to 0.
typedef struct millis_state_t {
    unsigned long current_ms; // Milliseconds accumulated so far
    unsigned last_tick;       // Timer value up to which current_ms has been accounted for
} millis_state_t;

// Millisecond counter kept in caller-supplied state, so each task can own its own counter.
// Each call reads the timer, converts the ticks elapsed since millis_state.last_tick into
// whole milliseconds, adds them to millis_state.current_ms and returns that total.
// Only the ticks that made up whole milliseconds are consumed; the remainder stays pending
// for the next call, so frequent calls no longer lose time.
// The elapsed-tick subtraction relies on unsigned wrap-around, so the function has to be
// called more often than the timer wraps (estimated in the thread at about 43 seconds).
unsigned long millis_p (millis_state_t &millis_state) {
    timer tmr;
    unsigned current_tick;
    unsigned long elapsed_ms;
    tmr :> current_tick;
    elapsed_ms = (unsigned long)(current_tick - millis_state.last_tick) / XS1_TIMER_KHZ;
    millis_state.current_ms += elapsed_ms;
    // Advance only by the ticks that were converted, keeping the sub-millisecond remainder
    millis_state.last_tick += elapsed_ms * XS1_TIMER_KHZ;
    return millis_state.current_ms;
}

#define RF69_TX_LIMIT_MS 100 // Timeout for one polling round, in milliseconds
#define TEST_TYPE unsigned long // Same type as the value returned by millis_p()/millis_g()
#define MILLIS() millis_p(my_millis) // or millis_g() to use the shared globals

// Runs 2000 polling rounds for the task identified by task_num. Each round records a start
// time and polls every 20 ms, logging the current time, the start time, their difference and
// the timed-out flag, until digitalRead() returns non-zero or the difference reaches
// RF69_TX_LIMIT_MS.
void test_task (int task_num) {

    TEST_TYPE millis_;
    TEST_TYPE txStart;
    unsigned  testCnt = 0;
    bool      not_timed_out;
    // Referenced by name from the MILLIS() macro, so it must stay in scope under this name
    millis_state_t my_millis = {0,0}; // if you use millis_p() function

    while (testCnt < 2000) {
        txStart = MILLIS();
        do {
            millis_ = MILLIS();
            // Unsigned subtraction: the difference stays valid if the counter itself wraps
            not_timed_out = (millis_ - txStart) < RF69_TX_LIMIT_MS;
            // NOTE(review): %d is used for unsigned long / unsigned arguments - confirm that
            // debug_printf treats them as intended on the target.
            debug_printf    ("task %d: testCnt(%d), millis(%d), txStart(%d), millis_-txStart(%d), timedOut(%d)\n",
                    task_num, testCnt, millis_, txStart, millis_ - txStart, !not_timed_out);
            delay_milliseconds(20);
        } while ((digitalRead() == 0) && not_timed_out);
        testCnt++;
    }
}
User avatar
akp
XCore Expert
Posts: 580
Joined: Thu Nov 26, 2015 11:47 pm

Post by akp »

I believe my implementation that uses globals may lose some time based on a quick test on a dev board, possibly indicative that the race condition is a real concern. So if you have lots of memory and you're not dead set on having a call that exactly matches millis(), then my millis_p() function might be better. Otherwise some kind of lock on the shared globals might be necessary.
User avatar
aclassifier
Respected Member
Posts: 507
Joined: Wed Apr 25, 2012 8:52 pm

Post by aclassifier »

Starting to test (compile as a start) this now. Where/how do you find "xc_ptr.h"?

I actually have a "standard" solution, see my blog notes. But this is still really interesting, and perhaps even useful!

I didn't envisage millis_p since the global situation shorted my thinking. When you also have a millis_g then that's great! But I wonder how the visibility across different tasks and files will be for millis_g?
--
Øyvind Teig
Trondheim (Norway)
https://www.teigfam.net/oyvind/home/
User avatar
aclassifier
Respected Member
Posts: 507
Joined: Wed Apr 25, 2012 8:52 pm

Post by aclassifier »

I see a GET_SHARED_GLOBAL in Application Note: AN01024 xCONNECT dynamic configuration demo, but couldn't find traces of xc_ptr.h out there..

XC timer is a global variable for the shared system timer. So we simply need a global timer_ms etc., don't we?

May take a day or two..
Last edited by aclassifier on Mon Mar 19, 2018 6:10 pm, edited 1 time in total.
--
Øyvind Teig
Trondheim (Norway)
https://www.teigfam.net/oyvind/home/
User avatar
akp
XCore Expert
Posts: 580
Joined: Thu Nov 26, 2015 11:47 pm

Post by akp »

millis_g will work for multiple tasks on the same tile, doubtful of multiple tiles (not sure) I just tested it on a single tile. However, I am pretty sure it has a race condition (on the globals) and might not be that accurate at the end of the day without using a synchronization mechanism.

xc_ptr.h is in the useful module_xc_ptr from https://github.com/xcore/sc_util

If you can get started I would be interested to hear the result of your optimizations and tests.
User avatar
akp
XCore Expert
Posts: 580
Joined: Thu Nov 26, 2015 11:47 pm

Post by akp »

Looks like I probably have an arithmetic error, will be fixing.
User avatar
akp
XCore Expert
Posts: 580
Joined: Thu Nov 26, 2015 11:47 pm

Post by akp »

I forgot I was losing fractional msec... here is my fix, just extending the tick counter to 64 bits. Also removed the dependency on xc_ptr.h. I think the race condition on millis_g is pretty bad because you could jump 43 seconds if you get a double increment of the g_tick_hi variable, I didn't really analyze it.

Code: Select all

// Word-sized accessors for a global variable shared between tasks, implemented as a single
// load/store instruction addressed through dp[<global name>].
// "volatile" keeps the compiler from removing or merging the accesses, and the "memory"
// clobber keeps it from moving other memory accesses across them.
// The #ifndef guards leave an existing definition (e.g. from xc_ptr.h) in place.
#ifndef GET_SHARED_GLOBAL
#define GET_SHARED_GLOBAL(x, g) asm volatile("ldw %0, dp[" #g "]":"=r"(x)::"memory")
#endif
#ifndef SET_SHARED_GLOBAL
#define SET_SHARED_GLOBAL(g, v) asm volatile("stw %0, dp[" #g "]"::"r"(v):"memory")
#endif
unsigned long g_tick_hi = 0;   // Upper 32 bits of the extended tick count (number of wraps seen)
unsigned long g_last_tick = 0; // Timer value read by the previous call

// Millisecond counter based on a 64-bit tick count whose upper half lives in shared globals.
// Each call reads the timer; if the new value is lower than the previous one, the timer is
// taken to have wrapped and g_tick_hi is incremented. The 64-bit count is then divided by
// XS1_TIMER_KHZ and the quotient is returned truncated to unsigned long.
// At most one wrap is counted per call, so the function has to be called at least once per
// timer wrap period. The updates of the globals are not protected against concurrent callers.
unsigned long millis_g () {
    timer tmr;
    unsigned long long total_ticks;
    unsigned long last_tick;
    unsigned long tick_hi;
    unsigned long current_tick;
    /* Possible race condition if accessed from multiple cores? */
    tmr :> current_tick;
    GET_SHARED_GLOBAL(tick_hi, g_tick_hi);
    GET_SHARED_GLOBAL(last_tick, g_last_tick);
    // A reading below the previous one is taken to mean that the timer wrapped
    if( current_tick < last_tick ) {
        ++tick_hi;
        SET_SHARED_GLOBAL(g_tick_hi, tick_hi);
    }
    SET_SHARED_GLOBAL(g_last_tick, current_tick);
    /* End possible race condition */
    // Combine wrap count (high word) and current reading (low word) into one 64-bit count
    total_ticks = ((unsigned long long)tick_hi << 32) | (unsigned long long)current_tick;
    return (unsigned long)(total_ticks / (unsigned long long)XS1_TIMER_KHZ);
}

// Caller-owned state for millis_p().
typedef struct millis_state_t {
    unsigned long tick_hi;   // Upper 32 bits of the extended tick count (number of wraps seen)
    unsigned long last_tick; // Timer value read by the previous call
} millis_state_t;

// Millisecond counter based on a 64-bit tick count kept in caller-supplied state.
// Each call reads the timer; if the new value is lower than millis_state.last_tick, the timer
// is taken to have wrapped and millis_state.tick_hi is incremented. The 64-bit count is then
// divided by XS1_TIMER_KHZ and the quotient is returned truncated to unsigned long.
// At most one wrap is counted per call, so the function has to be called at least once per
// timer wrap period for a given state.
unsigned long millis_p (millis_state_t &millis_state) {
    timer tmr;
    unsigned long long total_ticks;
    unsigned long current_tick;
    tmr :> current_tick;
    // A reading below the previous one is taken to mean that the timer wrapped
    if( current_tick < millis_state.last_tick ) {
        ++millis_state.tick_hi;
    }
    millis_state.last_tick = current_tick;
    // Combine wrap count (high word) and current reading (low word) into one 64-bit count
    total_ticks = ((unsigned long long)millis_state.tick_hi << 32) | (unsigned long long)current_tick;
    return (unsigned long)(total_ticks / (unsigned long long)XS1_TIMER_KHZ);
}
User avatar
aclassifier
Respected Member
Posts: 507
Joined: Wed Apr 25, 2012 8:52 pm

Post by aclassifier »

Great!
Off Topic
Since when did XC support 64-bit values? According to the xTIMEcomposer User Guide, chapter 48 (XS1 Data Types), it doesn't. I have searched through some of the release notes, like this, but failed to find anything about "64". If someone could give me that answer I'd update here
I have now compiled this code, courtesy you (below). Pardon the #indentation, but it probably looks better here.

I guess that you are saying that the race condition is still in the global code below, since the values are stored in shared RAM that may be updated in between by another core, unlike a HW timer that runs with a nice monotonic increase? However, a standard timer overflows every 43 seconds and we don't get the correct result if we only check a timeout like once per minute. I guess that we are race-free if we have the same requirement here? (Not that I completely see why the race is there only when it's incremented by 2 or more, a race is a race even for a one increment? You mean they may have the same increment.. No. How do we know that doesn't hurt? It could be a one and none that would yield -1... (just thinking here, not fine studied the code)).

Code: Select all

#include <stdio.h>
#include <xs1.h>
#include <iso646.h>
#include <timer.h> // delay_milliseconds(200), XS1_TIMER_HZ etc

// Minimal boolean type: false == 0, true == 1.
typedef enum {false,true} bool;

// Set to 0 to silence debug_print(); the printf call is then guarded by a constant-false test.
#define DEBUG_PRINT_TEST 1
// printf wrapper gated on DEBUG_PRINT_TEST. The do/while(0) makes it usable as a single statement.
// NOTE(review): fmt must be followed by at least one argument, since __VA_ARGS__ is placed after a comma.
#define debug_print(fmt, ...) do { if(DEBUG_PRINT_TEST) printf(fmt, __VA_ARGS__); } while (0)

// -- AKP --

// Word-sized accessors for a global variable shared between tasks, implemented as a single
// load/store instruction addressed through dp[<global name>].
// "volatile" keeps the compiler from removing or merging the accesses, and the "memory"
// clobber keeps it from moving other memory accesses across them.
// The #ifndef guards leave an existing definition (e.g. from xc_ptr.h) in place.
#ifndef GET_SHARED_GLOBAL
#define GET_SHARED_GLOBAL(x, g) asm volatile("ldw %0, dp[" #g "]":"=r"(x)::"memory")
#endif
#ifndef SET_SHARED_GLOBAL
#define SET_SHARED_GLOBAL(g, v) asm volatile("stw %0, dp[" #g "]"::"r"(v):"memory")
#endif

// Selects what MILLIS() expands to: 1 = millis_g() on shared globals,
// 0 = millis_p() on a caller-owned state variable named my_millis.
#define USE_GLOBAL 0

#if (USE_GLOBAL == 1)
    unsigned long g_tick_hi = 0;   // Upper 32 bits of the extended tick count (wraps seen)
    unsigned long g_last_tick = 0; // Timer value read by the previous call

    // Millisecond counter based on a 64-bit tick count whose upper half lives in shared
    // globals. A timer reading below the previous one increments g_tick_hi; the 64-bit count
    // divided by XS1_TIMER_KHZ is returned truncated to unsigned long.
    // At most one wrap is counted per call, and the updates of the globals are not protected
    // against concurrent callers.
    unsigned long millis_g () {
        timer tmr;
        unsigned long long total_ticks;
        unsigned long last_tick;
        unsigned long tick_hi;
        unsigned long current_tick;
        /* Possible race condition if accessed from multiple cores? */
        tmr :> current_tick;
        GET_SHARED_GLOBAL(tick_hi, g_tick_hi);
        GET_SHARED_GLOBAL(last_tick, g_last_tick);
        // A reading below the previous one is taken to mean that the timer wrapped
        if( current_tick < last_tick ) {
            ++tick_hi;
            SET_SHARED_GLOBAL(g_tick_hi, tick_hi);
        }
        SET_SHARED_GLOBAL(g_last_tick, current_tick);
        /* End possible race condition */
        // Combine wrap count (high word) and current reading (low word) into a 64-bit count
        total_ticks = ((unsigned long long)tick_hi << 32) | (unsigned long long)current_tick;
        return (unsigned long)(total_ticks / (unsigned long long)XS1_TIMER_KHZ);
    }
    #define MILLIS() millis_g()
#else // USE_GLOBAL == 0
    // Caller-owned state for millis_p().
    typedef struct millis_state_t {
        unsigned long tick_hi;   // Upper 32 bits of the extended tick count (wraps seen)
        unsigned long last_tick; // Timer value read by the previous call
    } millis_state_t;

    // Millisecond counter based on a 64-bit tick count kept in caller-supplied state.
    // A timer reading below millis_state.last_tick increments millis_state.tick_hi; the
    // 64-bit count divided by XS1_TIMER_KHZ is returned truncated to unsigned long.
    // At most one wrap is counted per call.
    unsigned long millis_p (millis_state_t &millis_state) {
        timer tmr;
        unsigned long long total_ticks;
        unsigned long current_tick;
        tmr :> current_tick;
        // A reading below the previous one is taken to mean that the timer wrapped
        if( current_tick < millis_state.last_tick ) {
            ++millis_state.tick_hi;
        }
        millis_state.last_tick = current_tick;
        // Combine wrap count (high word) and current reading (low word) into a 64-bit count
        total_ticks = ((unsigned long long)millis_state.tick_hi << 32) | (unsigned long long)current_tick;
        return (unsigned long)(total_ticks / (unsigned long long)XS1_TIMER_KHZ);
    }
    #define MILLIS() millis_p(my_millis)
#endif

    // Stand-in for the Arduino digitalRead(): waits 200 ms, then always reports the level 0.
    unsigned digitalRead (void) {
    const unsigned level = 0;
    delay_milliseconds (200);
    return level;
}

#define RF69_TX_LIMIT_MS 1000 // Timeout for one polling round, in milliseconds
#define TEST_TYPE unsigned long // Same type as the value returned by millis_g()/millis_p()

// Runs 2000 polling rounds for the task identified by task_num. Each round records a start
// time and then polls (20 ms delay plus one digitalRead() call per pass), logging the current
// time, the start time, their difference and the timed-out flag, until digitalRead() returns
// non-zero or the difference reaches RF69_TX_LIMIT_MS.
// The format string uses %u / %lu for the unsigned values, and the time values are cast to
// unsigned long so the specifiers stay correct if TEST_TYPE is changed.
void test_task (int task_num) {

    TEST_TYPE millis_;
    TEST_TYPE txStart;
    unsigned  testCnt = 0;
    bool      not_timed_out;
    #if (USE_GLOBAL == 0)
        // Referenced by name from the MILLIS() macro, so it must stay in scope under this name
        millis_state_t my_millis = {0,0};
    #endif

    while (testCnt < 2000) {
        txStart = MILLIS();
        do {
            millis_ = MILLIS();
            // Unsigned subtraction: the difference stays valid if the counter itself wraps
            not_timed_out = (millis_ - txStart) < RF69_TX_LIMIT_MS;
            debug_print ("task %d: testCnt(%u), millis(%lu), txStart(%lu), millis_-txStart(%lu), timedOut(%d)\n",
                    task_num, testCnt, (unsigned long)millis_, (unsigned long)txStart,
                    (unsigned long)(millis_ - txStart), !not_timed_out);
            delay_milliseconds(20);
        } while ((digitalRead() == 0) && not_timed_out);
        testCnt++;
    }
}

// Entry point: runs two instances of test_task() (task numbers 0 and 1) inside a par block
// and returns 0 afterwards.
int main() {
    par {
        test_task(0);
        test_task(1);
    }
    return 0;
}
This is so interesting! I didn't think out of the language like with shared globals. But I do know I long'ed for long long 64 bits not to care about overflow, because that's where one of my attempts failed.
--
Øyvind Teig
Trondheim (Norway)
https://www.teigfam.net/oyvind/home/