QuantifiedMe¶
- Created by: Erik Bjäreholt (GitHub, Twitter, LinkedIn)
- Latest version: https://erik.bjareholt.com/quantifiedme/Dashboard.html
- Source: https://github.com/ErikBjare/quantifiedme
QuantifiedMe is a collection of software tools for quantified self data. It is used to measure and manage behavior, productivity, health, habits, and life in general.
This public notebook is limited to time tracking data, with actual example data generated by aw-fakedata.
Introduction¶
The phrase "What gets measured gets managed" is sometimes thrown around in professional contexts. While often taken at face value, it's actually an important observation that today drives practically the entire world: companies measure performance and financial results, engineering teams keep track of their tasks and resources, and scientists measure everything from health outcomes to the trajectories of interstellar objects that could threaten the planet.
Indeed, collecting and analysing data is the foundation for all of science, or as Lord Kelvin put it:
I often say that when you can measure what you are speaking about, and express it in numbers, you know something about it; but when you cannot express it in numbers, your knowledge is of a meagre and unsatisfactory kind; it may be the beginning of knowledge, but you have scarcely, in your thoughts, advanced to the stage of science, whatever the matter may be.
– William Thomson (Lord Kelvin), Lecture on "Electrical Units of Measurement" (1883)
However, although measurement is common practice in professional contexts, where words like "data-driven" get thrown around, it's far less common in our personal lives. We generally don't see problems in our personal lives as matters that could be solved by measuring and analyzing. This is probably because we don't know which questions to ask, or because collecting and analyzing the data seems too difficult when we're unaware of the tools for the job, or simply because we compartmentalize the scientific method as a "work thing", something left to "real scientists".
So what if we had good open-source tools to easily ask questions and explore data about our personal lives? What if people shared that data with each other, and together worked on common personal problems (productivity, mental & physical health, work/life balance) in a truly scientific way? I think that's worth exploring.
I've built some of these tools over the past few years, among them the open-source time tracker ActivityWatch, and this is a little showcase of that work. I've used a notebook like this one almost every week for almost a year to explore my own behavior (many things didn't make the cut, sorry). It's been both fascinating and rich in insights about how I spend my time, and how I could do better in ways big and small. But the inquiry has only just started; there's a lot more to come.
Now, dear reader, I've blabbered enough. Enjoy my work, I hope you find it of interest (and use!). Be sure to check out some of the links at the end for more stuff like this!
Setup¶
First we do some imports, and set some variables used in the rest of the program.
import random
import logging
import itertools
from datetime import datetime, time, date, timezone, timedelta
from pathlib import Path
from collections import Counter, defaultdict
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pytz
from IPython.utils import io
from IPython.display import display
from IPython.core.display import HTML
import aw_core
from aw_core.models import Event
import aw_research, aw_research.classify
from aw_transform import union_no_overlap
from quantifiedme.config import load_config
from quantifiedme.derived.screentime import load_screentime
# Use XKCD-style plots
# FIXME: Causes the day trend plots to take forever for some unknown reason
# matplotlib.pyplot.xkcd(scale=0.8, randomness=1)
Configuration¶
Modify these to your liking!
# Set this to your timezone
your_timezone = pytz.timezone('Europe/Stockholm')
tz_offset = your_timezone.utcoffset(datetime.now())
# Use personal data, not fake data
personal = False
# Set to True to limit amount of data loaded (useful when developing/debugging)
fast = True
# Can be set to True to run faster on consecutive runs, but will not reflect changes in data,
# so needs to be set to False every now and then to clear the cache.
cache = True
# Days of history to use
days_back = 30 if fast else 5*365
The cell below sets the window title to something more descriptive, so that ActivityWatch can track that I'm working on this specific notebook (the default title isn't informative in JupyterLab).
%%javascript
document.title='QuantifiedMe - Jupyter'
Set current time¶
# Now let's just set the current time and our query interval and we're ready to load data!
# If not running in personal mode, use a fixed datetime to make notebook reproducible
now = datetime.now(tz=timezone.utc) if personal else datetime(2021, 6, 9, tzinfo=timezone.utc)
day_offset = timedelta(hours=4)
today = datetime.combine(now.date(), time()).astimezone(timezone.utc) + day_offset
since = today - timedelta(days=days_back)
print(f"Today: {today.date()}")
print(f"Start: {since}")
print(f"End: {now}")
Today: 2021-06-09 Start: 2021-05-10 04:00:00+00:00 End: 2021-06-09 00:00:00+00:00
Load data¶
We will load data from all sources into the events variable.
Each consecutive source fills any gaps left by previous sources (to prevent overlap), using union_no_overlap.
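The gap-filling idea can be sketched on plain intervals. This is a simplified stand-in for aw_transform's union_no_overlap, using (start, end) tuples instead of Event objects, not the library's actual implementation:

```python
from datetime import datetime, timedelta

def union_no_overlap_sketch(primary, secondary):
    """Merge two interval lists; intervals from `secondary` only fill
    time not already covered by `primary`.
    Simplified: assumes intervals within each list don't overlap each other."""
    result = list(primary)
    for start, end in secondary:
        pieces = [(start, end)]
        for p_start, p_end in primary:
            next_pieces = []
            for s, e in pieces:
                # keep only the parts of (s, e) that fall outside (p_start, p_end)
                if e <= p_start or s >= p_end:
                    next_pieces.append((s, e))
                else:
                    if s < p_start:
                        next_pieces.append((s, p_start))
                    if e > p_end:
                        next_pieces.append((p_end, e))
            pieces = next_pieces
        result.extend(pieces)
    return sorted(result)

t = datetime(2021, 6, 9)
h = timedelta(hours=1)
primary = [(t, t + 2 * h)]        # 00:00-02:00 from the first source
secondary = [(t + h, t + 4 * h)]  # 01:00-04:00 from the next source
print(union_no_overlap_sketch(primary, secondary))
# the secondary interval is clipped so only 02:00-04:00 is added
```

The result is that the first source always wins where sources overlap, and later sources only contribute the time the earlier ones missed.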
import pickle
cachefile = Path("events.pickle")
cachefile_fast = Path("events_fast.pickle")
load_pickled = False
if cache and load_pickled and cachefile.exists() and cachefile.stat().st_mtime > datetime.now().timestamp() - 60*60*24:
print("Loading cached events")
with cachefile.open('rb') as f:
events = pickle.load(f)
else:
# Uses my syncing testing instance to get multi-device data
events = load_screentime(since, datasources=None, hostnames=None, personal=personal, cache=cache)
# Save the events to a file so we can load them faster next time
with cachefile.open('wb') as f:
pickle.dump(events, f)
# smaller version
with cachefile_fast.open('wb') as f:
pickle.dump(events[:1000], f)
print(f"First: {events[0].timestamp}")
print(f"Last: {events[-1].timestamp}")
[WARNING] Found 198 events overlapping, totalling: 0:00:00.101261 [WARNING] Found 198 events overlapping, totalling: 0:00:00.101261 Query start: 2021-05-10 04:00:00+00:00 Events start: 2024-06-11 08:18:14.743000+00:00 [WARNING] Found 198 events overlapping, totalling: 0:00:00.101261 First: 2024-06-11 08:18:14.743000+00:00 Last: 2024-06-25 15:44:19.197000+00:00
Inspect data¶
Now let's take a look at the data. This could help us discover potential bugs in the loading code or issues with the data itself.
# Inspect the distribution of event duration
fig, ax = plt.subplots()
xlim = 50
pd.Series([e.duration.total_seconds() for e in events if e.duration.total_seconds() <= xlim]).plot.hist(bins=10, bottom=1)
ax.set_xlabel('Seconds')
ax.set_ylabel('# of events')
ax.set_xlim(0, xlim)
#ax.set_yscale('log')
#df = pd.DataFrame(pd.Series([e.duration.total_seconds() for e in events]))
#df["dur"] = (df[0] // 10) * 10
#df["logdur"] = log((df[0] * 1).round())
#df[df["dur"] > 10]["dur"].plot.hist()
#df.groupby("dur").mean() * df.groupby("dur").count()
(0.0, 50.0)
# print the longest event
longest = sorted(events, key=lambda e: e.duration, reverse=True)[0]
print(f"Longest duration event was {longest.duration}")
Longest duration event was 1:59:07.334053
fig, ax = plt.subplots()
xlim = 5
pd.Series([e.duration.total_seconds() for e in events if e.duration.total_seconds() <= xlim]).plot.hist(bins=10, bottom=1)
ax.set_xlabel('Seconds')
ax.set_ylabel('# of events')
ax.set_xlim(0, xlim)
(0.0, 5.0)
xlim = 2
pd.Series([e.duration.total_seconds() for e in events if e.duration.total_seconds() <= xlim]).plot.hist(bins=10, bottom=1)
ax.set_xlabel('Seconds')
ax.set_ylabel('# of events')
ax.set_xlim(0, xlim)
(0.0, 2.0)
total_events = len(events)
short_thres = 5
short_events = len([e for e in events if e.duration.total_seconds() < short_thres])
print(f"# of total events: {total_events}")
print(f"# of events <{short_thres}s: {short_events} ({round(100 * short_events/total_events)}%)")
# of total events: 313 # of events <5s: 8 (3%)
How much time is covered?¶
How much of the time in the range has been tracked?
# TODO: Include sleep for improved coverage
tracking_cov = sum((e.duration for e in events), timedelta()) / (now - since)
print(f"Tracking coverage: {100 * tracking_cov:.3}%")
Tracking coverage: 10.5%
Which devices are used the most?¶
time_by_source = defaultdict(float)
df_rows = []
for e in events:
date = e.timestamp.date()
source = e.data.get('$source', 'unknown')
host = e.data.get('$hostname', 'unknown')
duration = e.duration.total_seconds()
time_by_source[host] += duration
df_rows.append([date, source, host, duration])
def line_format(label):
"""
Convert time label to the format of pandas line plot
From: https://stackoverflow.com/a/53995225/965332
"""
month = label.month_name()[:3]
if month == 'Jan':
month += f'\n{label.year}'
return month
df = pd.DataFrame(df_rows, columns=["date", "source", "host", "duration"])
df = df.set_index(["date", "source", "host"])
df = df.groupby(level=[0,1,2]).sum()
df /= 60*60
df = df.unstack(level=2).fillna(0).reset_index().set_index('date')
df = df.drop(columns='source')
df = df.groupby(level=0).sum()
timeline = pd.to_datetime(pd.date_range(start=df.index[0], end=df.index[-1], freq='D'))
timeline_months = [date for date in timeline if date.day == 1]
# Add missing dates
df = df.reindex(pd.to_datetime(timeline), fill_value=0)
# Drop 'duration' level into individual hostname columns
df.columns = df.columns.droplevel(0)
assert len(df[df.index.duplicated()]) == 0
import matplotlib.dates as mdates
ax = df.plot.bar(stacked=True, figsize=(30, 4), ylim=(0, None), rot=0)
ax.xaxis.set_major_locator(mdates.MonthLocator())
ax.set_xticklabels(map(line_format, timeline_months));
#df
/tmp/ipykernel_2482/919551138.py:28: PerformanceWarning: dropping on a non-lexsorted multi-index without a level parameter may impact performance. df = df.drop(columns='source')
df = pd.DataFrame({"source": time_by_source.keys(), "duration": time_by_source.values()})
df['days'] = df['duration'] / (60*60*24)
df.plot.bar(x='source', y='days', rot=30)
df
source | duration | days | |
---|---|---|---|
0 | fakedata | 269412.947592 | 3.118205 |
Visualize¶
We now have events loaded from a variety of sources, annotated with categories and tags. Here comes the fun part!
Here are some visualizations I've found useful: they show your activity today, your activity over many days, and how much time you've spent in each category.
TODO: Add calendar heatmap plot
Today plot¶
Bar chart of which hours you've been active today.
from aw_research import split_event_on_hour, categorytime_per_day, categorytime_during_day, start_of_day, end_of_day
def plot_categorytime_during_day(events, category, color='teal'):
df = categorytime_during_day(events, category, today)
# FIXME: This will make the first and last hour to always be 0
ix = pd.date_range(start=start_of_day(today) + day_offset - tz_offset,
end=start_of_day(today) + timedelta(hours=24) + day_offset - tz_offset,
freq='H')
df = df.reindex(ix)
df = df.sort_index().asfreq('H')
fig = plt.figure(figsize=(18, 3))
ax = df.plot(kind='bar', color=color, rot=60)
ax.set_ylim(0, 1)
plt.title(category or "All activity")
def label_format_hour(label):
"""
Convert time label to the format of pandas line plot
Based on: https://stackoverflow.com/a/53995225/965332
"""
label = label.replace(tzinfo=your_timezone)
label = label + label.utcoffset()
return f"{label.hour}:{label.minute:02d}"  # zero-pad minutes (e.g. 9:05, not 9:50)
ax.set_xticklabels([label_format_hour(dt) for dt in df.index])
plt.tight_layout()
plot_categorytime_during_day(events, "")
plot_categorytime_during_day(events, "Work", color='green')
/home/runner/.cache/pypoetry/virtualenvs/quantifiedme-5kvM-1hc-py3.10/lib/python3.10/site-packages/aw_research/util.py:248: FutureWarning: 'H' is deprecated and will be removed in a future version, please use 'h' instead. return ts.resample("1H").apply("sum") /tmp/ipykernel_2482/3308955599.py:7: FutureWarning: 'H' is deprecated and will be removed in a future version, please use 'h' instead. ix = pd.date_range(start=start_of_day(today) + day_offset - tz_offset, /tmp/ipykernel_2482/3308955599.py:11: FutureWarning: 'H' is deprecated and will be removed in a future version, please use 'h' instead. df = df.sort_index().asfreq('H') /home/runner/.cache/pypoetry/virtualenvs/quantifiedme-5kvM-1hc-py3.10/lib/python3.10/site-packages/aw_research/util.py:248: FutureWarning: 'H' is deprecated and will be removed in a future version, please use 'h' instead. return ts.resample("1H").apply("sum") /tmp/ipykernel_2482/3308955599.py:7: FutureWarning: 'H' is deprecated and will be removed in a future version, please use 'h' instead. ix = pd.date_range(start=start_of_day(today) + day_offset - tz_offset, /tmp/ipykernel_2482/3308955599.py:11: FutureWarning: 'H' is deprecated and will be removed in a future version, please use 'h' instead. df = df.sort_index().asfreq('H')
Timeline¶
from quantifiedme.timelineplot.plot import TimelineFigure
import matplotlib
fig = TimelineFigure(title="Timeline")
# TODO: plot timeline with bars colored according to:
# - [ ] device used
# - [x] category
# - [ ] would be better with category colors configured
def dayevents_to_barsegments(events, colorkey='$hostname', cmap=matplotlib.colormaps['tab10']):
# useful colorkey values: app, $hostname, $category_hierarchy
groups = {}
for e in events:
if e.duration.total_seconds() < 5:
continue
t1 = e.timestamp.time()
t2 = (e.timestamp + e.duration).time()
if t1 > t2:
# event crossing midnight, skip
continue
start = t1.hour * 60 * 60 + t1.minute * 60 + t1.second
stop = t2.hour * 60 * 60 + t2.minute * 60 + t2.second
key = e.data[colorkey]
if key not in groups:
groups[key] = len(groups)
color = cmap(groups[key])
yield ((start, stop), color, '')
# split events into days, days into bar segments
for date, dayevents in itertools.groupby(sorted(events, key=lambda e: e.timestamp), lambda e: e.timestamp.date()):
dayevents = list(dayevents)
bar = [segment for segment in dayevents_to_barsegments(dayevents)]
if bar:
fig.add_bar(bar, title=str(date))
ticks = list(range(0, 24 * 60 * 60, 60 * 60))
labels = [str(t // (60*60)) for t in ticks]
fig.ax.set_xticks(ticks, labels)
fig.ax.grid(True, axis="x", linestyle='--', linewidth=1, zorder=-1000, alpha=0.4)
fig.plot()
# Needs extra work with matplotlib "artists" (never used before)
#fig.ax.legend()
Trends plot¶
Useful to see how much you've engaged in a particular activity over time.
def plot_category(cat, big=False, barcolor=(0.2, 0.4, 0.8, 0.5)):
#aw_research.classify._plot_category_daily_trend(events, [cat])
try:
ts = categorytime_per_day(events, cat)
except Exception as e:
print(f"Error for category '{cat}': {e}")
return
fig, ax = plt.subplots(figsize=(24, 5 if big else 3))
ax.bar(ts.index, ts, label=f"{cat}: daily", color=barcolor)
ax.plot(ts.index, ts.rolling(7, min_periods=4).mean(), label=f"7d SMA")
ax.plot(ts.index, ts.rolling(30, min_periods=14).mean(), label=f"30d SMA")
ax.plot(ts.index, ts.rolling(60, min_periods=30).mean(), label=f"60d SMA")
plt.legend(loc='upper left')
plt.title(cat)
plt.xlim(pd.Timestamp(since), pd.Timestamp(now))
plt.ylim(0)
plt.grid(linestyle='--')
plt.tight_layout()
color_prod = (0.1, 0.8, 0.1, 0.8)
color_unprod = (0.8, 0.1, 0.1, 0.8)
# All logged activity
plot_category('', big=True)
# Work-related
plot_category('Work', big=True, barcolor=color_prod)
plot_category('Programming')
plot_category('ActivityWatch')
plot_category('QuantifiedMe')
plot_category('Thankful')
plot_category('Algobit')
plot_category('uniswap-python')
Error for category 'QuantifiedMe': No events to calculate on Error for category 'Thankful': No events to calculate on Error for category 'Algobit': No events to calculate on Error for category 'uniswap-python': No events to calculate on
# School-related
plot_category('School')
plot_category('Self-directed')
plot_category('Maths')
Error for category 'School': No events to calculate on Error for category 'Self-directed': No events to calculate on Error for category 'Maths': No events to calculate on
# Entertainment
plot_category('Media', big=True, barcolor=color_unprod)
plot_category('Social Media')
plot_category('Video')
plot_category('Music')
plot_category('Games')
Error for category 'Video': No events to calculate on Error for category 'Games': No events to calculate on
# All uncategorized time
plot_category('Uncategorized')
Category sunburst¶
Uses the category hierarchy to create an overview of how time has been spent during a given period.
events_today = [e for e in events if today < e.timestamp]
def plot_sunburst(events):
plt.figure(figsize=(6, 6))
aw_research.classify._plot_category_hierarchy_sunburst(events)
display(HTML(f"<b>Total duration: {sum((e.duration for e in events), timedelta(0))}</b>"))
plot_sunburst(events_today)
plot_sunburst([e for e in events if today - timedelta(days=30) < e.timestamp])
plot_sunburst(events)
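The plotting itself is handled by an internal aw_research helper, but the underlying roll-up is simple: each event's $category_hierarchy string (e.g. "Work -> Programming") contributes its duration to every ancestor level. A minimal sketch of that aggregation, using hypothetical (hierarchy, duration) pairs instead of Event objects:

```python
from collections import defaultdict
from datetime import timedelta

def rollup_durations(events):
    """Sum durations into every level of the category hierarchy.
    `events` is a list of (category_hierarchy, duration) pairs."""
    totals = defaultdict(timedelta)
    for hierarchy, duration in events:
        parts = hierarchy.split(" -> ")
        # credit each ancestor: "Work", then "Work -> Programming", etc.
        for i in range(1, len(parts) + 1):
            totals[" -> ".join(parts[:i])] += duration
    return dict(totals)

events = [
    ("Work -> Programming", timedelta(hours=2)),
    ("Work -> Writing", timedelta(hours=1)),
    ("Media -> Video", timedelta(minutes=30)),
]
print(rollup_durations(events)["Work"])  # 3:00:00
```

The inner ring of the sunburst corresponds to the top-level totals, with each outer ring subdividing its parent.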
Fictional wage plot¶
Prioritizing things in life can be hard, and it's not uncommon to want to maximize how much you earn. But how much is working on project X actually worth to you in monetary terms? What about project Y?
By assigning hourly wages to different categories, we can plot which activities we've earned the most (fictional) money from. This can help you see how much you'd have earned, both from individual activities and in total.
# NOTE: Setting a rate for a subcategory will add to the rate for the parent category, if any
category_wages = {
"Work": 30, # Base rate
"ActivityWatch": 15, # In addition to the base rate
"QuantifiedMe": 40, # Self-analyzing my behavior probably has high returns
"Thankful": 40,
"School": 60,
"Algobit": 100,
"Finance": 100,
}
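The additive rule in the note above can be illustrated with a small helper (hypothetical, not used by the notebook itself, which queries categorytime_per_day per category instead):

```python
def effective_rate(category_hierarchy, category_wages):
    """Resolve the effective hourly rate for a full category path by
    summing the rates of every level that has one configured."""
    total = 0
    for part in category_hierarchy.split(" -> "):
        total += category_wages.get(part, 0)
    return total

wages = {"Work": 30, "ActivityWatch": 15}
# "ActivityWatch" adds its 15 on top of the "Work" base rate of 30
print(effective_rate("Work -> Programming -> ActivityWatch", wages))  # 45
```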
def plot_wages(events, category_wages):
fig, ax1 = plt.subplots()
df = pd.DataFrame()
for cat, wage in category_wages.items():
try:
df[cat] = wage * categorytime_per_day(events, cat)
except Exception as e:
print(f"Exception for category {cat}: {e}")
df.plot.area(label='total', stacked=True, figsize=(16, 4), ax=ax1, legend=False)
ax1.set_ylabel("Daily wage ($)")
ax1.legend(loc=1)
#ax2 = ax1.twinx()
#df.sum(axis=1).rolling(7).mean().plot(label='Total 7d SMA', legend=False, ax=ax2)
df.sum(axis=1).rolling(30).mean().plot(label='Total 30d SMA', legend=False, ax=ax1)
#ax2.set_ylabel("Daily wage ($)")
#ax2.legend(loc=2)
plt.xlim(pd.Timestamp(since), pd.Timestamp(now))
plt.grid(linestyle='-.')
plt.tight_layout()
plot_wages(events, category_wages)
Exception for category QuantifiedMe: No events to calculate on Exception for category Thankful: No events to calculate on Exception for category School: No events to calculate on Exception for category Algobit: No events to calculate on Exception for category Finance: No events to calculate on
Category checks¶
Let's dig into some categories to see what they contain.
Uncategorized¶
To check for events we could classify better.
def time_per_keyval(events, key):
vals = defaultdict(lambda: timedelta(0))
for e in events:
if key in e.data:
vals[e.data[key]] += e.duration
else:
vals[f'key {key} did not exist'] += e.duration
return vals
def print_time_per_keyval(events, key, limit=20, sortby='duration'):
from tabulate import tabulate
if sortby == 'duration':
l = sorted([(v, k) for k, v in time_per_keyval(events, key).items()], reverse=True)
elif sortby == 'key':
l = sorted([(k, v) for k, v in time_per_keyval(events, key).items()], reverse=True)
else:
raise ValueError(f'invalid option for sortby, was "{sortby}"')
print(tabulate(l[:limit], headers=['time', 'val']))
events_uncategorized = [e for e in events if 'Uncategorized' in e.data['$tags']]
print_time_per_keyval(events_uncategorized, 'title', limit=10)
time val -------------- -------------------------------------- 3:17:55.342974 Zoom Meeting 3:09:38.417148 Stack Overflow - stackoverflow.com/ 3:01:50.228072 Minecraft 1:52:01.286888 Gmail - mail.google.com/ 1:20:37.553210 Unknown site 0:12:41.101441 Google Calendar - calendar.google.com/
events_uncategorized_today = [e for e in events_uncategorized if e.timestamp > today]
print_time_per_keyval(events_uncategorized_today, 'title')
time val -------------- -------------------------------------- 3:17:55.342974 Zoom Meeting 3:09:38.417148 Stack Overflow - stackoverflow.com/ 3:01:50.228072 Minecraft 1:52:01.286888 Gmail - mail.google.com/ 1:20:37.553210 Unknown site 0:12:41.101441 Google Calendar - calendar.google.com/
Underspecified categories¶
Let's dig into a category with many children to see if some events can be classified better.
events_programming = [e for e in events if 'Work -> Programming' == e.data['$category_hierarchy']]
print_time_per_keyval(events_programming, 'title')
time val ------ -----
#print_time_per_keyval(events, "$category_hierarchy", limit=100, sortby='key')
#tabulate(set(e.data["$category_hierarchy"] for e in events))
print_time_per_keyval(events, '$source')
time val ---------------------- ------------- 3 days, 2:50:12.947592 activitywatch
Category tree¶
Build a tree structure to compute aggregate times.
from aw_research.tree import Node
def build_tree(data: dict):
root = Node('root', timedelta())
for k, v in sorted(data.items()):
hier = k.split(" -> ")
parent = root
# Get (or create) parent
for level in hier[:-1]:
if level not in parent:
# Create parent node
parent += Node(level, timedelta())
parent = parent[level]
# Create child node
level = hier[-1]
if level not in parent:
parent += Node(level, v)
else:
print("Unexpected!")
return root
root = build_tree(time_per_keyval(events, '$category_hierarchy'))
print(root.print())
root: 3 days, 2:50:12.947592 (0:00:00) Work: 1 day, 20:09:10.571133 (0:00:00) Programming: 1 day, 20:09:10.571133 (0:00:00) ActivityWatch: 1 day, 20:09:10.571133 Uncategorized: 12:54:43.929733 Social Media: 12:19:39.764770 (0:00:00) Twitter: 5:24:48.938004 Reddit: 4:28:30.944409 Facebook: 2:26:19.882357 Media: 5:26:38.681956 (0:00:00) YouTube: 4:13:13.156699 Music: 1:13:25.525257
Closing remarks¶
That's the end of the notebook, thank you for checking it out!
I hope you'll upvote and/or comment wherever you saw it to help it get seen!
Share & donate!¶
Did you like it? Consider supporting us so we can keep building!
- TODO: Add link/image/button to Patreon
- Like ActivityWatch on AlternativeTo! (TODO: ...and ProductHunt)
- Post about it on Twitter!
Run it yourself!¶
You can run this notebook with your own data; it's a lot more fun!
Download and install ActivityWatch, then check out the quantifiedme repo for usage instructions.
- TODO: Actually add details for how to run it in the README
Other interesting links¶
- Memento Labs, a platform for self-study using quantified self data.
Thanks to¶
- Johan Bjäreholt, my brother, for his amazing contributions, and for working on ActivityWatch with me for so long. This wouldn't be possible without him.
- All the other contributors, whose stats are listed here.
- Our Patrons/backers/supporters, your financial contribution means a lot!
- @karpathy for creating ulogme, a spiritual ancestor of ActivityWatch
- Our users, you motivate us to keep working!
TODO: Post to¶
- Hacker News
- ActivityWatch Forum (under the 'Projects' category)