/*
 * Copyright (c) 2013-2015, The Linux Foundation. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 and
 * only version 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 */

#define pr_fmt(fmt) "bw-hwmon: " fmt

#include <linux/kernel.h>
#include <linux/sizes.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/io.h>
#include <linux/delay.h>
#include <linux/ktime.h>
#include <linux/time.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/mutex.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/platform_device.h>
#include <linux/of.h>
#include <linux/devfreq.h>
#include <trace/events/power.h>
#include "governor.h"
#include "governor_bw_hwmon.h"

#define NUM_MBPS_ZONES		10
struct hwmon_node {
	unsigned int guard_band_mbps;
	unsigned int decay_rate;
	unsigned int io_percent;
	unsigned int bw_step;
	unsigned int sample_ms;
	unsigned int up_scale;
	unsigned int up_thres;
	unsigned int down_thres;
	unsigned int down_count;
	unsigned int hist_memory;
	unsigned int hyst_trigger_count;
	unsigned int hyst_length;
	unsigned int idle_mbps;
	unsigned int low_power_ceil_mbps;
	unsigned int low_power_io_percent;
	unsigned int low_power_delay;
	unsigned int mbps_zones[NUM_MBPS_ZONES];

	unsigned long prev_ab;
	unsigned long *dev_ab;
	unsigned long resume_freq;
	unsigned long resume_ab;
	unsigned long bytes;
	unsigned long max_mbps;
	unsigned long hist_max_mbps;
	unsigned long hist_mem;
	unsigned long hyst_peak;
	unsigned long hyst_mbps;
	unsigned long hyst_trig_win;
	unsigned long hyst_en;
	unsigned long above_low_power;
	unsigned long prev_req;
	unsigned long up_wake_mbps;
	unsigned long down_wake_mbps;
	unsigned int wake;
	unsigned int down_cnt;
	ktime_t prev_ts;
	ktime_t hist_max_ts;
	bool sampled;
	bool mon_started;
	struct list_head list;
	void *orig_data;
	struct bw_hwmon *hw;
	struct devfreq_governor *gov;
	struct attribute_group *attr_grp;
};

#define UP_WAKE 1
#define DOWN_WAKE 2
static DEFINE_SPINLOCK(irq_lock);

static LIST_HEAD(hwmon_list);
static DEFINE_MUTEX(list_lock);

static int use_cnt;
static DEFINE_MUTEX(state_lock);

#define show_attr(name) \
static ssize_t show_##name(struct device *dev, \
			struct device_attribute *attr, char *buf) \
{ \
	struct devfreq *df = to_devfreq(dev); \
	struct hwmon_node *hw = df->data; \
	return snprintf(buf, PAGE_SIZE, "%u\n", hw->name); \
}

#define store_attr(name, _min, _max) \
static ssize_t store_##name(struct device *dev, \
			struct device_attribute *attr, const char *buf, \
			size_t count) \
{ \
	struct devfreq *df = to_devfreq(dev); \
	struct hwmon_node *hw = df->data; \
	int ret; \
	unsigned int val; \
	ret = sscanf(buf, "%u", &val); \
	if (ret != 1) \
		return -EINVAL; \
	val = max(val, _min); \
	val = min(val, _max); \
	hw->name = val; \
	return count; \
}

#define gov_attr(__attr, min, max) \
show_attr(__attr) \
store_attr(__attr, min, max) \
static DEVICE_ATTR(__attr, 0644, show_##__attr, store_##__attr)

#define show_list_attr(name, n) \
static ssize_t show_list_##name(struct device *dev, \
			struct device_attribute *attr, char *buf) \
{ \
	struct devfreq *df = to_devfreq(dev); \
	struct hwmon_node *hw = df->data; \
	unsigned int i, cnt = 0; \
\
	for (i = 0; i < n && hw->name[i]; i++) \
		cnt += snprintf(buf + cnt, PAGE_SIZE - cnt, "%u ", \
				hw->name[i]); \
	cnt += snprintf(buf + cnt, PAGE_SIZE - cnt, "\n"); \
	return cnt; \
}

#define store_list_attr(name, n, _min, _max) \
static ssize_t store_list_##name(struct device *dev, \
			struct device_attribute *attr, const char *buf, \
			size_t count) \
{ \
	struct devfreq *df = to_devfreq(dev); \
	struct hwmon_node *hw = df->data; \
	int ret; \
	unsigned int i = 0, val; \
\
	do { \
		ret = sscanf(buf, "%u", &val); \
		if (ret != 1) \
			break; \
		buf = strnchr(buf, PAGE_SIZE, ' '); \
		if (buf) \
			buf++; \
		val = max(val, _min); \
		val = min(val, _max); \
		hw->name[i] = val; \
		i++; \
	} while (buf && i < n - 1); \
	if (i < 1) \
		return -EINVAL; \
	hw->name[i] = 0; \
	return count; \
}

#define gov_list_attr(__attr, n, min, max) \
show_list_attr(__attr, n) \
store_list_attr(__attr, n, min, max) \
static DEVICE_ATTR(__attr, 0644, show_list_##__attr, store_list_##__attr)

#define MIN_MS	10U
#define MAX_MS	500U

/* Returns MBps of read/writes for the sampling window. */
static unsigned int bytes_to_mbps(long long bytes, unsigned int us)
{
	bytes *= USEC_PER_SEC;
	do_div(bytes, us);
	bytes = DIV_ROUND_UP_ULL(bytes, SZ_1M);
	return bytes;
}

static unsigned int mbps_to_bytes(unsigned long mbps, unsigned int ms)
{
	mbps *= ms;
	mbps = DIV_ROUND_UP(mbps, MSEC_PER_SEC);
	mbps *= SZ_1M;
	return mbps;
}
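
/*
 * Worked example for the conversions above (illustrative numbers, not
 * measurements): 52,428,800 bytes (50 MB) seen over a 50,000 us window
 * is 52,428,800 * 1,000,000 / 50,000 = 1,048,576,000 bytes/s, which
 * rounds up to 1000 MBps. Going the other way, mbps_to_bytes(1000, 50)
 * yields 1000 * 50 / 1000 * SZ_1M = 52,428,800 bytes, i.e. the byte
 * count that 1000 MBps corresponds to in one 50 ms sample window.
 */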
static int __bw_hwmon_sample_end(struct bw_hwmon *hwmon)
{
	struct devfreq *df;
	struct hwmon_node *node;
	ktime_t ts;
	unsigned long bytes, mbps;
	unsigned int us;
	int wake = 0;

	df = hwmon->df;
	node = df->data;

	ts = ktime_get();
	us = ktime_to_us(ktime_sub(ts, node->prev_ts));

	bytes = hwmon->get_bytes_and_clear(hwmon);
	bytes += node->bytes;
	node->bytes = 0;

	mbps = bytes_to_mbps(bytes, us);
	node->max_mbps = max(node->max_mbps, mbps);

	/*
	 * If the measured bandwidth in a micro sample is greater than the
	 * wake up threshold, it indicates an increase in load that's non
	 * trivial. So, have the governor ignore historical idle time or low
	 * bandwidth usage and do the bandwidth calculation based on just
	 * this micro sample.
	 */
	if (mbps > node->up_wake_mbps) {
		wake = UP_WAKE;
	} else if (mbps < node->down_wake_mbps) {
		if (node->down_cnt)
			node->down_cnt--;
		if (node->down_cnt <= 0)
			wake = DOWN_WAKE;
	}

	node->prev_ts = ts;
	node->wake = wake;
	node->sampled = true;
	trace_bw_hwmon_meas(dev_name(df->dev.parent), mbps, us, wake);

	return wake;
}

int bw_hwmon_sample_end(struct bw_hwmon *hwmon)
{
	unsigned long flags;
	int wake;

	spin_lock_irqsave(&irq_lock, flags);
	wake = __bw_hwmon_sample_end(hwmon);
	spin_unlock_irqrestore(&irq_lock, flags);

	return wake;
}

unsigned long to_mbps_zone(struct hwmon_node *node, unsigned long mbps)
{
	int i;

	for (i = 0; i < NUM_MBPS_ZONES && node->mbps_zones[i]; i++)
		if (node->mbps_zones[i] >= mbps)
			return node->mbps_zones[i];

	return node->hw->df->max_freq;
}
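
/*
 * Example for to_mbps_zone() (hypothetical tunables): with mbps_zones
 * set to "1000 2000 4000" via sysfs, a 1500 MBps request snaps up to
 * the 2000 MBps zone, 4500 MBps falls past the last zone and returns
 * df->max_freq, and an empty zone list (mbps_zones[0] == 0) always
 * returns df->max_freq.
 */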
#define MIN_MBPS	500UL
#define HIST_PEAK_TOL	60
static unsigned long get_bw_and_set_irq(struct hwmon_node *node,
					unsigned long *freq, unsigned long *ab)
{
	unsigned long meas_mbps, thres, flags, req_mbps, adj_mbps;
	unsigned long meas_mbps_zone;
	unsigned long hist_lo_tol, hyst_lo_tol;
	struct bw_hwmon *hw = node->hw;
	unsigned int new_bw, io_percent;
	ktime_t ts;
	unsigned int ms;

	spin_lock_irqsave(&irq_lock, flags);

	ts = ktime_get();
	ms = ktime_to_ms(ktime_sub(ts, node->prev_ts));
	if (!node->sampled || ms >= node->sample_ms)
		__bw_hwmon_sample_end(node->hw);
	node->sampled = false;

	req_mbps = meas_mbps = node->max_mbps;
	node->max_mbps = 0;

	hist_lo_tol = (node->hist_max_mbps * HIST_PEAK_TOL) / 100;
	/* Remember historic peak in the past hist_mem decision windows. */
	if (meas_mbps > node->hist_max_mbps || !node->hist_mem) {
		/* If new max or no history */
		node->hist_max_mbps = meas_mbps;
		node->hist_mem = node->hist_memory;
	} else if (meas_mbps >= hist_lo_tol) {
		/*
		 * If subsequent peaks come close (within tolerance) to but
		 * less than the historic peak, then reset the history start,
		 * but not the peak value.
		 */
		node->hist_mem = node->hist_memory;
	} else {
		/* Count down history expiration. */
		if (node->hist_mem)
			node->hist_mem--;
	}

	/* Keep track of whether we are in low power mode consistently. */
	if (meas_mbps > node->low_power_ceil_mbps)
		node->above_low_power = node->low_power_delay;
	if (node->above_low_power)
		node->above_low_power--;

	if (node->above_low_power)
		io_percent = node->io_percent;
	else
		io_percent = node->low_power_io_percent;

	/*
	 * The AB value that corresponds to the lowest mbps zone greater than
	 * or equal to the "frequency" the current measurement will pick.
	 * This upper limit is useful for balancing out any prediction
	 * mechanisms to be power friendly.
	 */
	meas_mbps_zone = (meas_mbps * 100) / io_percent;
	meas_mbps_zone = to_mbps_zone(node, meas_mbps_zone);
	meas_mbps_zone = (meas_mbps_zone * io_percent) / 100;
	meas_mbps_zone = max(meas_mbps, meas_mbps_zone);

	/*
	 * If this is a wake up due to BW increase, vote much higher BW than
	 * what we measure to stay ahead of increasing traffic and then set
	 * it up to vote for measured BW if we see down_count short sample
	 * windows of low traffic.
	 */
	if (node->wake == UP_WAKE) {
		req_mbps += ((meas_mbps - node->prev_req)
				* node->up_scale) / 100;
		/*
		 * However if the measured load is less than the historic
		 * peak, but the over request is higher than the historic
		 * peak, then we could limit the over requesting to the
		 * historic peak.
		 */
		if (req_mbps > node->hist_max_mbps
		    && meas_mbps < node->hist_max_mbps)
			req_mbps = node->hist_max_mbps;

		req_mbps = min(req_mbps, meas_mbps_zone);
	}

	hyst_lo_tol = (node->hyst_mbps * HIST_PEAK_TOL) / 100;
	if (meas_mbps > node->hyst_mbps && meas_mbps > MIN_MBPS) {
		hyst_lo_tol = (meas_mbps * HIST_PEAK_TOL) / 100;
		node->hyst_peak = 0;
		node->hyst_trig_win = node->hyst_length;
		node->hyst_mbps = meas_mbps;
	}

	/*
	 * Check node->max_mbps to avoid double counting peaks that cause
	 * early termination of a window.
	 */
	if (meas_mbps >= hyst_lo_tol && meas_mbps > MIN_MBPS &&
	    !node->max_mbps) {
		node->hyst_peak++;
		if (node->hyst_peak >= node->hyst_trigger_count ||
		    node->hyst_en)
			node->hyst_en = node->hyst_length;
	}

	if (node->hyst_trig_win)
		node->hyst_trig_win--;
	if (node->hyst_en)
		node->hyst_en--;

	if (!node->hyst_trig_win && !node->hyst_en) {
		node->hyst_peak = 0;
		node->hyst_mbps = 0;
	}

	if (node->hyst_en) {
		if (meas_mbps > node->idle_mbps)
			req_mbps = max(req_mbps, node->hyst_mbps);
	}
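
	/*
	 * Illustration of the hysteresis logic above (hypothetical
	 * numbers): with hyst_trigger_count = 3 and hyst_length = 10, a
	 * 3000 MBps burst records hyst_mbps = 3000 and opens a 10-window
	 * trigger period. If at least 3 of those windows measure at least
	 * 60% (HIST_PEAK_TOL) of that peak, hyst_en is armed and, as long
	 * as traffic stays above idle_mbps, the vote is held at the
	 * 3000 MBps peak for the next 10 windows instead of decaying
	 * between bursts.
	 */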
	/* Stretch the short sample window size, if the traffic is too low */
	if (meas_mbps < MIN_MBPS) {
		node->up_wake_mbps = (max(MIN_MBPS, req_mbps)
					* (100 + node->up_thres)) / 100;
		node->down_wake_mbps = 0;
		thres = mbps_to_bytes(max(MIN_MBPS, req_mbps / 2),
					node->sample_ms);
	} else {
		/*
		 * Up wake vs down wake are intentionally a percentage of
		 * req_mbps vs meas_mbps to make sure the over requesting
		 * phase is handled properly. We only want to wake up and
		 * reduce the vote based on the measured mbps being less than
		 * the previous measurement that caused the "over request".
		 */
		node->up_wake_mbps = (req_mbps * (100 + node->up_thres)) / 100;
		node->down_wake_mbps = (meas_mbps * node->down_thres) / 100;
		thres = mbps_to_bytes(meas_mbps, node->sample_ms);
	}
	node->down_cnt = node->down_count;
	node->bytes = hw->set_thres(hw, thres);

	node->wake = 0;
	node->prev_req = req_mbps;

	spin_unlock_irqrestore(&irq_lock, flags);
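
	/*
	 * Vote computation below: pad the request with guard_band_mbps,
	 * then let it rise immediately but decay slowly. Example with
	 * hypothetical values guard_band_mbps = 100 and decay_rate = 90:
	 * if prev_ab is 2000 and the padded request drops to 1100, the
	 * new vote is (1100 * 90 + 2000 * 10) / 100 = 1190 MBps rather
	 * than falling straight to 1100.
	 */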
	adj_mbps = req_mbps + node->guard_band_mbps;

	if (adj_mbps > node->prev_ab) {
		new_bw = adj_mbps;
	} else {
		new_bw = adj_mbps * node->decay_rate
			+ node->prev_ab * (100 - node->decay_rate);
		new_bw /= 100;
	}

	node->prev_ab = new_bw;
	if (ab)
		*ab = roundup(new_bw, node->bw_step);

	*freq = (new_bw * 100) / io_percent;
	trace_bw_hwmon_update(dev_name(node->hw->df->dev.parent),
				new_bw,
				*freq,
				node->up_wake_mbps,
				node->down_wake_mbps);
	return req_mbps;
}

static struct hwmon_node *find_hwmon_node(struct devfreq *df)
{
	struct hwmon_node *node, *found = NULL;

	mutex_lock(&list_lock);
	list_for_each_entry(node, &hwmon_list, list)
		if (node->hw->dev == df->dev.parent ||
		    node->hw->of_node == df->dev.parent->of_node ||
		    (!node->hw->dev && !node->hw->of_node &&
		     node->gov == df->governor)) {
			found = node;
			break;
		}
	mutex_unlock(&list_lock);

	return found;
}

int update_bw_hwmon(struct bw_hwmon *hwmon)
{
	struct devfreq *df;
	struct hwmon_node *node;
	int ret;

	if (!hwmon)
		return -EINVAL;
	df = hwmon->df;
	if (!df)
		return -ENODEV;
	node = df->data;
	if (!node)
		return -ENODEV;

	if (!node->mon_started)
		return -EBUSY;

	dev_dbg(df->dev.parent, "Got update request\n");
	devfreq_monitor_stop(df);

	mutex_lock(&df->lock);
	ret = update_devfreq(df);
	if (ret)
		dev_err(df->dev.parent,
			"Unable to update freq on request!\n");
	mutex_unlock(&df->lock);

	devfreq_monitor_start(df);

	return 0;
}

static int start_monitor(struct devfreq *df, bool init)
{
	struct hwmon_node *node = df->data;
	struct bw_hwmon *hw = node->hw;
	struct device *dev = df->dev.parent;
	unsigned long mbps;
	int ret;

	node->prev_ts = ktime_get();

	if (init) {
		node->prev_ab = 0;
		node->resume_freq = 0;
		node->resume_ab = 0;
		mbps = (df->previous_freq * node->io_percent) / 100;
		ret = hw->start_hwmon(hw, mbps);
	} else {
		ret = hw->resume_hwmon(hw);
	}

	if (ret) {
		dev_err(dev, "Unable to start HW monitor! (%d)\n", ret);
		return ret;
	}

	if (init)
		devfreq_monitor_start(df);
	else
		devfreq_monitor_resume(df);

	node->mon_started = true;

	return 0;
}

static void stop_monitor(struct devfreq *df, bool init)
{
	struct hwmon_node *node = df->data;
	struct bw_hwmon *hw = node->hw;

	node->mon_started = false;

	if (init) {
		devfreq_monitor_stop(df);
		hw->stop_hwmon(hw);
	} else {
		devfreq_monitor_suspend(df);
		hw->suspend_hwmon(hw);
	}
}

static int gov_start(struct devfreq *df)
{
	int ret = 0;
	struct device *dev = df->dev.parent;
	struct hwmon_node *node;
	struct bw_hwmon *hw;
	struct devfreq_dev_status stat;

	node = find_hwmon_node(df);
	if (!node) {
		dev_err(dev, "Unable to find HW monitor!\n");
		return -ENODEV;
	}
	hw = node->hw;

	stat.private_data = NULL;
	if (df->profile->get_dev_status)
		ret = df->profile->get_dev_status(df->dev.parent, &stat);
	if (ret || !stat.private_data)
		dev_warn(dev, "Device doesn't take AB votes!\n");
	else
		node->dev_ab = stat.private_data;

	hw->df = df;
	node->orig_data = df->data;
	df->data = node;

	if (start_monitor(df, true))
		goto err_start;

	ret = sysfs_create_group(&df->dev.kobj, node->attr_grp);
	if (ret)
		goto err_sysfs;

	return 0;

err_sysfs:
	stop_monitor(df, true);
err_start:
	df->data = node->orig_data;
	node->orig_data = NULL;
	hw->df = NULL;
	node->dev_ab = NULL;
	return ret;
}

static void gov_stop(struct devfreq *df)
{
	struct hwmon_node *node = df->data;
	struct bw_hwmon *hw = node->hw;

	sysfs_remove_group(&df->dev.kobj, node->attr_grp);
	stop_monitor(df, true);
	df->data = node->orig_data;
	node->orig_data = NULL;
	hw->df = NULL;
	/*
	 * Not all governors know about this additional extended device
	 * configuration. To avoid leaving the extended configuration at a
	 * stale state, set it to 0 and let the next governor take it from
	 * there.
	 */
	if (node->dev_ab)
		*node->dev_ab = 0;
	node->dev_ab = NULL;
}

static int gov_suspend(struct devfreq *df)
{
	struct hwmon_node *node = df->data;
	unsigned long resume_freq = df->previous_freq;
	/* Devices that don't take AB votes have no dev_ab to save. */
	unsigned long resume_ab = node->dev_ab ? *node->dev_ab : 0;

	if (!node->hw->suspend_hwmon)
		return -ENOSYS;

	if (node->resume_freq) {
		dev_warn(df->dev.parent, "Governor already suspended!\n");
		return -EBUSY;
	}

	stop_monitor(df, false);

	mutex_lock(&df->lock);
	update_devfreq(df);
	mutex_unlock(&df->lock);

	node->resume_freq = resume_freq;
	node->resume_ab = resume_ab;

	return 0;
}

static int gov_resume(struct devfreq *df)
{
	struct hwmon_node *node = df->data;

	if (!node->hw->resume_hwmon)
		return -ENOSYS;

	if (!node->resume_freq) {
		dev_warn(df->dev.parent, "Governor already resumed!\n");
		return -EBUSY;
	}

	mutex_lock(&df->lock);
	update_devfreq(df);
	mutex_unlock(&df->lock);

	node->resume_freq = 0;
	node->resume_ab = 0;

	return start_monitor(df, false);
}
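
/*
 * While the monitor is stopped for suspend, get_target_freq below returns
 * the cached resume_freq/resume_ab values. gov_suspend() caches them only
 * after calling update_devfreq(), so the suspend-time re-evaluation sees
 * resume_freq == 0 and lets the vote drop; gov_resume() re-evaluates with
 * the cached values to restore the pre-suspend vote before clearing them
 * and restarting the monitor.
 */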
static int devfreq_bw_hwmon_get_freq(struct devfreq *df,
					unsigned long *freq,
					u32 *flag)
{
	struct hwmon_node *node = df->data;

	/* Suspend/resume sequence */
	if (!node->mon_started) {
		*freq = node->resume_freq;
		if (node->dev_ab)
			*node->dev_ab = node->resume_ab;
		return 0;
	}

	get_bw_and_set_irq(node, freq, node->dev_ab);

	return 0;
}

static ssize_t store_throttle_adj(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct devfreq *df = to_devfreq(dev);
	struct hwmon_node *node = df->data;
	int ret;
	unsigned int val;

	if (!node->hw->set_throttle_adj)
		return -ENOSYS;

	ret = kstrtouint(buf, 10, &val);
	if (ret)
		return ret;

	ret = node->hw->set_throttle_adj(node->hw, val);

	if (!ret)
		return count;
	else
		return ret;
}

static ssize_t show_throttle_adj(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct devfreq *df = to_devfreq(dev);
	struct hwmon_node *node = df->data;
	unsigned int val;

	if (!node->hw->get_throttle_adj)
		val = 0;
	else
		val = node->hw->get_throttle_adj(node->hw);

	return snprintf(buf, PAGE_SIZE, "%u\n", val);
}

static DEVICE_ATTR(throttle_adj, 0644, show_throttle_adj,
						store_throttle_adj);

gov_attr(guard_band_mbps, 0U, 2000U);
gov_attr(decay_rate, 0U, 100U);
gov_attr(io_percent, 1U, 100U);
gov_attr(bw_step, 50U, 1000U);
gov_attr(sample_ms, 1U, 50U);
gov_attr(up_scale, 0U, 500U);
gov_attr(up_thres, 1U, 100U);
gov_attr(down_thres, 0U, 90U);
gov_attr(down_count, 0U, 90U);
gov_attr(hist_memory, 0U, 90U);
gov_attr(hyst_trigger_count, 0U, 90U);
gov_attr(hyst_length, 0U, 90U);
gov_attr(idle_mbps, 0U, 2000U);
gov_attr(low_power_ceil_mbps, 0U, 2500U);
gov_attr(low_power_io_percent, 1U, 100U);
gov_attr(low_power_delay, 1U, 60U);
gov_list_attr(mbps_zones, NUM_MBPS_ZONES, 0U, UINT_MAX);

static struct attribute *dev_attr[] = {
	&dev_attr_guard_band_mbps.attr,
	&dev_attr_decay_rate.attr,
	&dev_attr_io_percent.attr,
	&dev_attr_bw_step.attr,
	&dev_attr_sample_ms.attr,
	&dev_attr_up_scale.attr,
	&dev_attr_up_thres.attr,
	&dev_attr_down_thres.attr,
	&dev_attr_down_count.attr,
	&dev_attr_hist_memory.attr,
	&dev_attr_hyst_trigger_count.attr,
	&dev_attr_hyst_length.attr,
	&dev_attr_idle_mbps.attr,
	&dev_attr_low_power_ceil_mbps.attr,
	&dev_attr_low_power_io_percent.attr,
	&dev_attr_low_power_delay.attr,
	&dev_attr_mbps_zones.attr,
	&dev_attr_throttle_adj.attr,
	NULL,
};

static struct attribute_group dev_attr_group = {
	.name = "bw_hwmon",
	.attrs = dev_attr,
};

static int devfreq_bw_hwmon_ev_handler(struct devfreq *df,
					unsigned int event, void *data)
{
	int ret;
	unsigned int sample_ms;

	switch (event) {
	case DEVFREQ_GOV_START:
		sample_ms = df->profile->polling_ms;
		sample_ms = max(MIN_MS, sample_ms);
		sample_ms = min(MAX_MS, sample_ms);
		df->profile->polling_ms = sample_ms;

		ret = gov_start(df);
		if (ret)
			return ret;

		dev_dbg(df->dev.parent,
			"Enabled dev BW HW monitor governor\n");
		break;

	case DEVFREQ_GOV_STOP:
		gov_stop(df);
		dev_dbg(df->dev.parent,
			"Disabled dev BW HW monitor governor\n");
		break;

	case DEVFREQ_GOV_INTERVAL:
		sample_ms = *(unsigned int *)data;
		sample_ms = max(MIN_MS, sample_ms);
		sample_ms = min(MAX_MS, sample_ms);
		devfreq_interval_update(df, &sample_ms);
		break;

	case DEVFREQ_GOV_SUSPEND:
		ret = gov_suspend(df);
		if (ret) {
			dev_err(df->dev.parent,
				"Unable to suspend BW HW mon governor (%d)\n",
				ret);
			return ret;
		}

		dev_dbg(df->dev.parent, "Suspended BW HW mon governor\n");
		break;

	case DEVFREQ_GOV_RESUME:
		ret = gov_resume(df);
		if (ret) {
			dev_err(df->dev.parent,
				"Unable to resume BW HW mon governor (%d)\n",
				ret);
			return ret;
		}

		dev_dbg(df->dev.parent, "Resumed BW HW mon governor\n");
		break;
	}

	return 0;
}

static struct devfreq_governor devfreq_gov_bw_hwmon = {
	.name = "bw_hwmon",
	.get_target_freq = devfreq_bw_hwmon_get_freq,
	.event_handler = devfreq_bw_hwmon_ev_handler,
};
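
/*
 * Minimal usage sketch for register_bw_hwmon() (illustrative only; the
 * exact members of struct bw_hwmon come from governor_bw_hwmon.h, and
 * the my_*() callbacks are hypothetical hardware hooks):
 *
 *	static struct bw_hwmon my_hwmon = {
 *		.start_hwmon		= my_start_hwmon,
 *		.stop_hwmon		= my_stop_hwmon,
 *		.suspend_hwmon		= my_suspend_hwmon,
 *		.resume_hwmon		= my_resume_hwmon,
 *		.get_bytes_and_clear	= my_get_bytes_and_clear,
 *		.set_thres		= my_set_thres,
 *	};
 *
 *	my_hwmon.dev = &pdev->dev;
 *	ret = register_bw_hwmon(&pdev->dev, &my_hwmon);
 *
 * The hardware driver then calls bw_hwmon_sample_end() or
 * update_bw_hwmon() from its threshold interrupt to report traffic.
 */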
int register_bw_hwmon(struct device *dev, struct bw_hwmon *hwmon)
{
	int ret = 0;
	struct hwmon_node *node;
	struct attribute_group *attr_grp;

	if (!hwmon->gov && !hwmon->dev && !hwmon->of_node)
		return -EINVAL;

	node = devm_kzalloc(dev, sizeof(*node), GFP_KERNEL);
	if (!node) {
		dev_err(dev, "Unable to register gov. Out of memory!\n");
		return -ENOMEM;
	}

	if (hwmon->gov) {
		attr_grp = devm_kzalloc(dev, sizeof(*attr_grp), GFP_KERNEL);
		if (!attr_grp)
			return -ENOMEM;

		hwmon->gov->get_target_freq = devfreq_bw_hwmon_get_freq;
		hwmon->gov->event_handler = devfreq_bw_hwmon_ev_handler;
		attr_grp->name = hwmon->gov->name;
		attr_grp->attrs = dev_attr;

		node->gov = hwmon->gov;
		node->attr_grp = attr_grp;
	} else {
		node->gov = &devfreq_gov_bw_hwmon;
		node->attr_grp = &dev_attr_group;
	}

	node->guard_band_mbps = 100;
	node->decay_rate = 90;
	node->io_percent = 16;
	node->low_power_ceil_mbps = 0;
	node->low_power_io_percent = 16;
	node->low_power_delay = 60;
	node->bw_step = 190;
	node->sample_ms = 50;
	node->up_scale = 0;
	node->up_thres = 10;
	node->down_thres = 0;
	node->down_count = 3;
	node->hist_memory = 0;
	node->hyst_trigger_count = 3;
	node->hyst_length = 0;
	node->idle_mbps = 400;
	node->mbps_zones[0] = 0;
	node->hw = hwmon;

	mutex_lock(&list_lock);
	list_add_tail(&node->list, &hwmon_list);
	mutex_unlock(&list_lock);

	if (hwmon->gov) {
		ret = devfreq_add_governor(hwmon->gov);
	} else {
		mutex_lock(&state_lock);
		if (!use_cnt)
			ret = devfreq_add_governor(&devfreq_gov_bw_hwmon);
		if (!ret)
			use_cnt++;
		mutex_unlock(&state_lock);
	}

	if (!ret)
		dev_info(dev, "BW HWmon governor registered.\n");
	else
		dev_err(dev, "BW HWmon governor registration failed!\n");

	return ret;
}

MODULE_DESCRIPTION("HW monitor based dev DDR bandwidth voting driver");
MODULE_LICENSE("GPL v2");