#!/usr/bin/env perl ############################################################################# # This is a portable attempt at strtotime() with the goal of being 95% # compatible with Date::Parse::str2time(). The entire thing should be one # function that can be pasted in to your code. Time::Local will be required # but it's a core module so you already have it. # # The implementation is done entirely with regexps and pattern matching. # We try to guess which parts of a given string are the date and which are # the time. In the cases where there pieces are missing we attempt to use # sane defaults # # See bottom for benchmark information. For most cases, we're faster than # Date::Parse which is surprising. # # Scott: 2023-01-06 ############################################################################# use strict; use warnings; use v5.16; use Getopt::Long; use Date::Parse qw(str2time); use Benchmark; my $debug; my $benchmark; my $string; GetOptions( 'debug' => \$debug, 'benchmark' => \$benchmark, 'string=s' => \$string, ); my $filter = $ARGV[0] || ""; ############################################################################### ############################################################################### my @times = ( "1979-02-24", "1979-10-06", "1979/04/16", "Sat May 8 21:24:31 2021", "2000-02-29T12:34:56", "May 4 01:04:16", "1995-01-24T09:08:17.1823213", "Thu, 13 Oct 94 10:13:13 +0700", "16 Nov 94 22:28:20 +1000", "Thu, 13 Oct 94 10:13:13 +0200", "January 5 2023 12:53 am", "January 9 2019 12:53 pm", "Mon May 10 11:09:36 MDT 2021", "Wed, 16 Jun 94 07:29:35 CST", "16 Nov 94 22:28:20 PST", 'Mon, 14 Nov 1994 11:34:32 -0500 (EST)', 'Jul 22 10:00:00 UTC 2002', 'Wed, 9 Nov 94 16:31:52 PST', 'Wed, 09 Nov 94 10:41:10 -0800', '21/dec/93 17:05', 'dec/21/93 17:05', 'Dec/21/1993 17:05:00', '10:00:00', '1994-11-05T13:15:30Z', '12-24-1999', '02-24-1979', #'21/dec 17:05', ); if ($string) { @times = ($string); } if ($filter) { @times = grep { /$filter/; } @times; } if ($benchmark) { my $num = 10000; print "Comparing " . scalar(@times) . " strings\n"; cmpthese($num, { 'Scott' => sub { foreach(@times) { my $x = strtotime($_); }; }, 'Date::Parse' => sub { foreach(@times) { my $x = Date::Parse::str2time($_); }; }, }); exit; } printf("%38s = %14s = %14s\n", "Input String", "Date::Parse", "Scott"); foreach (@times) { my $x = Date::Parse::str2time($_) // 0; my $y = strtotime($_, $debug) // 0; if ($x != $y) { print color('orange'); my $diff = $y - $x; my $diff_str = ''; if ($diff >= (3600 * 24 * 365)) { $diff_str = ($diff / (3600 * 24 * 365)) . " years"; } elsif ($diff >= (3600 * 24 * 30)) { $diff_str = ($diff / (3600 * 24 * 30)) . " months"; } elsif ($diff >= (3600 * 24)) { $diff_str = ($diff / (3600 * 24)) . " days"; } elsif ($diff >= (3600)) { $diff_str = $diff / 3600 . " hours"; } else { $diff_str = $diff / 60 . " minutes"; } printf("%38s = %14.3f = %14.3f (diff: %s)\n", $_, $x, $y, $diff_str); print color(); } else { printf("%38s = %14.3f = %14.3f\n", $_, $x, $y); } } ############################################################################### ############################################################################### sub benchmark_individual { foreach my $str (@times) { my $num = 50000; print "Comparing '$str'\n"; cmpthese($num, { 'Scott' => sub { my $x = strtotime($str); }, 'Date::Parse' => sub { my $x = Date::Parse::str2time($str); }, }); print "\n"; } exit; } sub benchmark2 { my $str = $_[0]; use Benchmark qw(cmpthese); $str ||= "16 Nov 94 22:28:20 +1000"; #$str ||= "May 4 01:04:16"; print "Comparing: '$str'\n"; cmpthese(100000, { 'Date::Parse' => sub { Date::Parse::str2time($str) }, 'Scott' => sub { main::strtotime($str) }, }); exit; } # The logic here is that we use regular expressions to pull out various patterns # YYYY/MM/DD, H:I:S, DD MonthWord YYYY sub strtotime { my ($str, $debug) = @_; # If we use state the variables doesn't get instantiated EVERY time and it's much faster # https://timezonedb.com/download CORE::state $TZ_OFFSET = { 'ACDT' => 10, 'ACST' => 9, 'ACT' => -5, 'ACWST' => 8, 'ADT' => -3, 'AEDT' => 11, 'AEST' => 10, 'AFT' => 4, 'AKDT' => -8, 'AKST' => -9, 'ALMT' => 6, 'AMST' => 5, 'AMT' => 4, 'ANAST' => 12, 'ANAT' => 12, 'AQTT' => 5, 'ART' => -3, 'AST' => -4, 'AWDT' => 9, 'AWST' => 8, 'AZOST' => 0, 'AZOT' => -1, 'AZST' => 5, 'AZT' => 4, 'AoE' => -12, 'BNT' => 8, 'BOT' => -4, 'BRST' => -2, 'BRT' => -3, 'BST' => 1, 'BTT' => 6, 'CAST' => 8, 'CAT' => 2, 'CCT' => 6, 'CDT' => -5, 'CEST' => 2, 'CET' => 1, 'CHADT' => 13, 'CHAST' => 12, 'CHOST' => 9, 'CHOT' => 8, 'CHUT' => 10, 'CIDST' => -4, 'CIST' => -5, 'CKT' => -10, 'CLST' => -3, 'CLT' => -4, 'COT' => -5, 'CST' => -6, 'CVT' => -1, 'CXT' => 7, 'ChST' => 10, 'DAVT' => 7, 'DDUT' => 10, 'EASST' => -5, 'EAST' => -6, 'EAT' => 3, 'ECT' => -5, 'EDT' => -4, 'EEST' => 3, 'EET' => 2, 'EGST' => 0, 'EGT' => -1, 'EST' => -5, 'FET' => 3, 'FJST' => 13, 'FJT' => 12, 'FKST' => -3, 'FKT' => -4, 'FNT' => -2, 'GALT' => -6, 'GAMT' => -9, 'GET' => 4, 'GFT' => -3, 'GILT' => 12, 'GMT' => 0, 'GST' => -2, 'GYT' => -4, 'HDT' => -9, 'HKT' => 8, 'HOVST' => 8, 'HOVT' => 7, 'HST' => -10, 'ICT' => 7, 'IDT' => 3, 'IOT' => 6, 'IRDT' => 4, 'IRKST' => 9, 'IRKT' => 8, 'IRST' => 3, 'IST' => 2, 'JST' => 9, 'KGT' => 6, 'KOST' => 11, 'KRAST' => 8, 'KRAT' => 7, 'KST' => 9, 'KUYT' => 4, 'LHDT' => 11, 'LHST' => 10, 'LINT' => 14, 'MAGST' => 12, 'MAGT' => 11, 'MART' => -9, 'MAWT' => 5, 'MDT' => -6, 'MHT' => 12, 'MMT' => 6, 'MSD' => 4, 'MSK' => 3, 'MST' => -7, 'MUT' => 4, 'MVT' => 5, 'MYT' => 8, 'NCT' => 11, 'NDT' => -2, 'NFDT' => 12, 'NFT' => 11, 'NOVST' => 7, 'NOVT' => 7, 'NPT' => 5, 'NRT' => 12, 'NST' => -3, 'NUT' => -11, 'NZDT' => 13, 'NZST' => 12, 'OMSST' => 7, 'OMST' => 6, 'ORAT' => 5, 'PDT' => -7, 'PET' => -5, 'PETST' => 12, 'PETT' => 12, 'PGT' => 10, 'PHOT' => 13, 'PHT' => 8, 'PKT' => 5, 'PMDT' => -2, 'PMST' => -3, 'PONT' => 11, 'PST' => -8, 'PWT' => 9, 'PYST' => -3, 'PYT' => 8, 'QYZT' => 6, 'RET' => 4, 'ROTT' => -3, 'SAKT' => 11, 'SAMT' => 4, 'SAST' => 2, 'SBT' => 11, 'SCT' => 4, 'SGT' => 8, 'SRET' => 11, 'SRT' => -3, 'SST' => -11, 'SYOT' => 3, 'TAHT' => -10, 'TFT' => 5, 'TJT' => 5, 'TKT' => 13, 'TLT' => 9, 'TMT' => 5, 'TOST' => 14, 'TOT' => 13, 'TRT' => 3, 'TVT' => 12, 'ULAST' => 9, 'ULAT' => 8, 'UYST' => -2, 'UYT' => -3, 'UZT' => 5, 'VET' => -4, 'VLAST' => 11, 'VLAT' => 10, 'VOST' => 6, 'VUT' => 11, 'WAKT' => 12, 'WARST' => -3, 'WAST' => 2, 'WAT' => 1, 'WEST' => 1, 'WET' => 0, 'WFT' => 12, 'WGST' => -2, 'WGT' => -3, 'WIB' => 7, 'WIT' => 9, 'WITA' => 8, 'WST' => 1, 'YAKST' => 10, 'YAKT' => 9, 'YAPT' => 10, 'YEKST' => 6, 'YEKT' => 5, 'Z' => 0, }; CORE::state $mons = { 'jan' => 1, 'feb' => 2, 'mar' => 3, 'apr' => 4 , 'may' => 5 , 'jun' => 6 , 'jul' => 7, 'aug' => 8, 'sep' => 9, 'oct' => 10, 'nov' => 11, 'dec' => 12, }; CORE::state $month_regexp = qr/Jan|January|Feb|February|Mar|March|Apr|April|May|Jun|June|Jul|July|Aug|August|Sep|September|Oct|October|Nov|November|Dec|December/i; # Separator between dates pieces: '-' or '/' or '\' CORE::state $sep = qr/[\/\\-]/; my ($year, $month, $day) = (0,0,0); my ($hour, $min, $sec, $ms) = (0,0,0,0); #################################################################################################### #################################################################################################### # First we look to see if we have anything that mathches YYYY-MM-DD (numerically) if ($str =~ m/\b((\d{4})$sep(\d{2})$sep(\d{2})|(\d{2})$sep(\d{2})$sep(\d{4}))/) { # YYYY-MM-DD: 1999-12-24 if ($2 || $3) { $year = $2; $month = $3; $day = $4; } # DD-MM-YYYY: 12-24-1999 if ($5 || $6) { $day = $5; $month = $6; $year = $7; # It might be American format (MM-DD-YYYY) so we do a quick flip/flop if ($month > 12) { ($day, $month) = ($month, $day); } } } # The year may be on the end of the string like: Sat May 8 21:24:31 2021 if (!$year) { ($year) = $str =~ m/\s(\d{4})\b/; } #################################################################################################### # Next we look for alpha months followed by a digit if we didn't find a numeric month above # This will find: "April 13" and also "13 April 1995" if (!$month && $str =~ m/(\d{1,2})?\s*($month_regexp)\s+(\d{1,4})/) { # Get the numerical number for this month my $month_name = lc(substr($2,0,3)); $month = $mons->{$month_name}; # 17 March 94 if ($1) { $day = int($1); $year = int($3); } else { $day = int($3); } } #################################################################################################### # Alternate date string like like: 21/dec/93 or dec/21/93 (much less common) not sure if it's worth supporting this) if (!$month && $str =~ /(.*)($month_regexp)(.*)/) { my $before = $1; my $after = $3; $month = $mons->{lc($2)}; # Month starts string: dec/21/93 if ($before eq "") { $after =~ m/(\d{2})$sep(\d{2,4})/; $day = $1; $year = $2; } elsif ($before && $after) { $before =~ s/(\d+)\D/$1/g; $after =~ s/\D(\d{2,4}).*/$1/g; $day = $before; $year = $after; } } #################################################################################################### # Now we look for times: 10:14, 10:14:17, 08:15pm if ($str =~ m/(\b|T)(\d{1,2}):(\d{1,2}):?(\d{2}(Z|\.\d+)?)?( ?am|pm|AM|PM)?\b/) { $hour = int($2); $min = int($3); $sec = $4 || 0; # Not int() cuz it might be float for milliseconds $sec =~ s/Z$//; my $ampm = lc($6 || ""); # PM means add 12 hours if ($ampm eq "pm") { $hour += 12; } # 12:15am = 00:15 / 12:15pm = 12:15 so we have to compensate if ($ampm && ($hour == 24 || $hour == 12)) { $hour -= 12; } } my $has_time = ($hour || $min || $sec); my $has_date = ($year || $month || $day); if (!$has_time && !$has_date) { return undef; } #################################################################################################### #################################################################################################### # Sanity check some basic boundaries if ($month > 12 || $day > 31 || $hour > 23 || $min > 60 || $sec > 61) { return undef; } $month ||= (localtime())[4] + 1; # If there is no month, we assume the current month $day ||= (localtime())[3]; # If there is no day, we assume the current day # If we STILL don't have a year it may be a time only string so we assume it's the current year $year ||= (localtime())[5] + 1900; # Convert any two digit years to four digits if ($year < 100) { $year += 1900; } if ($debug) { printf("%38s = %02d/%02d/%02d %02d:%02d:%02d\n", $str, $year || -1, $month || -1, $day || -1, $hour, $min, $sec); } # If we have all the requisite pieces we build a unixtime my $ret; eval { $ret = Time::Local::timegm_modern($sec, $min, $hour, $day, $month - 1, $year); }; # If we find a timezone offset we take that in to account now # Either: +1000 or -0700 # or # 11:53 PST (Three or four chars after a time) if ($ret && $str =~ m/(\s([+-])(\d{2})(\d{2})|:\d{2} ([a-zA-Z]{3,4})\b|\d{2}(Z)$)/) { my $str_offset = 0; if ($5 || $6) { my $tz_code = $5 || $6 || ''; # Timezone offsets are in hours, so we convert to seconds $str_offset = $TZ_OFFSET ->{$tz_code} || 0; $str_offset *= 3600; #k("$tz_code = $str_offset"); } else { # Break the input string into parts so we can do math $str_offset = ($3 + ($4 / 60)) * 3600; if ($2 eq "-") { $str_offset *= -1; } } $ret -= $str_offset; # No timezone to account for so we assume the local timezone } elsif ($ret) { # We get the local timezone by creating local time obj and a UTC time obj # and comparing the two my @t = localtime($ret); my $local_offset = (Time::Local::timegm(@t) - Time::Local::timelocal(@t)); $ret -= $local_offset; } return $ret; } sub trim { my $s = shift(); if (!defined($s) || length($s) == 0) { return ""; } $s =~ s/^\s*//; $s =~ s/\s*$//; return $s; } # String format: '115', '165_bold', '10_on_140', 'reset', 'on_173', 'red', 'white_on_blue' sub color { my $str = shift(); # If we're NOT connected to a an interactive terminal don't do color if (-t STDOUT == 0) { return ''; } # No string sent in, so we just reset if (!length($str) || $str eq 'reset') { return "\e[0m"; } # Some predefined colors my %color_map = qw(red 160 blue 27 green 34 yellow 226 orange 214 purple 93 white 15 black 0); $str =~ s|([A-Za-z]+)|$color_map{$1} // $1|eg; # Get foreground/background and any commands my ($fc,$cmd) = $str =~ /^(\d{1,3})?_?(\w+)?$/g; my ($bc) = $str =~ /on_(\d{1,3})$/g; # Some predefined commands my %cmd_map = qw(bold 1 italic 3 underline 4 blink 5 inverse 7); my $cmd_num = $cmd_map{$cmd // 0}; my $ret = ''; if ($cmd_num) { $ret .= "\e[${cmd_num}m"; } if (defined($fc)) { $ret .= "\e[38;5;${fc}m"; } if (defined($bc)) { $ret .= "\e[48;5;${bc}m"; } return $ret; } # Debug print variable using either Data::Dump::Color (preferred) or Data::Dumper # Creates methods k() and kd() to print, and print & die respectively BEGIN { if (eval { require Data::Dump::Color }) { *k = sub { Data::Dump::Color::dd(@_) }; } else { require Data::Dumper; *k = sub { print Data::Dumper::Dumper(\@_) }; } sub kd { k(@_); printf("Died at %2\$s line #%3\$s\n",caller()); exit(15); } } __END__ Performance varies depending on string input Running the entire test suite through both strtotime() (mine) and Date::Parse::str2time() via --bench gets the following output: $ perl baker-strtotime.pl --bench Comparing 24 strings Rate Date::Parse Scott Date::Parse 1562/s -- -31% Scott 2257/s 44% --