admin管理员组

文章数量:1337123

QCM6490 SSR 记述(一)

项目场景:

modem 子系统crash导致系统crash,偶现。且SSR已经关闭。

如果disable_restart_work 设置为DISABLE_SSR,那么不管什么(wlan adsp-audio/sensor modem etc)触发了SSR,都不会重启

/* Magic value: when disable_restart_work holds it, restart requests are ignored. */
#define DISABLE_SSR 0x9889deed
/* If set to 0x9889deed, call to subsystem_restart_dev() returns immediately */
//static uint disable_restart_work;
/*
 * The commented-out line above is the same variable without an initializer;
 * here it is initialised to DISABLE_SSR, so SSR starts out disabled.
 */
static uint disable_restart_work = DISABLE_SSR;

常用的打开log:

# Turn on the dynamic-debug print flag (+p) for every call site in subsystem_restart.c
adb shell "echo 'file subsystem_restart.c +p' > /sys/kernel/debug/dynamic_debug/control"  
# Turn on the dynamic-debug print flag (+p) for every call site in subsys-pil-tz.c
adb shell "echo 'file subsys-pil-tz.c +p' > /sys/kernel/debug/dynamic_debug/control"  

问题描述

1.有以下几种情况会导致系统重启/crash:

//subsystem_restart_dev
//01.If a system reboot/shutdown is underway ignore subsystem errors.
//However, print a message so that we know that a subsystem behaved unexpectedly here.
extern enum system_states {
	SYSTEM_BOOTING,
	SYSTEM_SCHEDULING,
	SYSTEM_RUNNING,
	SYSTEM_HALT,
	SYSTEM_POWER_OFF,
	SYSTEM_RESTART,
	SYSTEM_SUSPEND,
} system_state;

if (system_state == SYSTEM_RESTART
	|| system_state == SYSTEM_POWER_OFF) {
	pr_err("%s crashed during a system poweroff/shutdown.\n", name);
	return -EBUSY;
}

//02.disable_restart_work = DISABLE_SSR;直接跳过
if (disable_restart_work == DISABLE_SSR) {
	pr_err("subsys-restart: Ignoring restart request for %s\n",
		name);
	return 0;
}

//03.restart_level
switch (dev->restart_level) {
case RESET_SUBSYS_COUPLED: //related 已经确认是这里
	__subsystem_restart_dev(dev);
	break;
case RESET_SOC: //system
	__pm_stay_awake(dev->ssr_wlock);
	schedule_work(&dev->device_restart_work);
	return 0;
default:
	panic("subsys-restart: Unknown restart level!\n");
	break;
}
//__subsystem_restart_dev
//04.正常的情况下,应该是track->p_state为SUBSYS_NORMAL;dev->track.state为SUBSYS_ONLINE;否则系统重启
if (track->p_state != SUBSYS_CRASHED &&
		dev->track.state == SUBSYS_ONLINE) {
	if (track->p_state != SUBSYS_RESTARTING) {
		track->p_state = SUBSYS_CRASHED;
		__pm_stay_awake(dev->ssr_wlock);
		queue_work(ssr_wq, &dev->work); //触发子系统重启
	} else {
		pr_err("Subsystem %s crashed during SSR!", name);
	}
} else
	WARN(dev->track.state == SUBSYS_OFFLINE,
		"SSR aborted: %s subsystem not online\n", name);

//	INIT_WORK(&subsys->work, subsystem_restart_wq_func);
//05.再次检测系统状态,系统关机重启abort SSR
if (system_state == SYSTEM_RESTART
	|| system_state == SYSTEM_POWER_OFF) {
	WARN(1, "SSR aborted: %s, system reboot/shutdown is under way\n",
		desc->name);
	pr_err("SSR aborted: %s, system reboot/shutdown is under way\n",
		desc->name);
	return;
}

//06.子系统没有起来,abort SSR
if (dev->track.state == SUBSYS_OFFLINE) {
	mutex_unlock(&track->lock);
	WARN(1, "SSR aborted: %s subsystem not online\n", desc->name);
	pr_err("SSR aborted: %s subsystem not online\n",
		desc->name);
	return;
}

2.首先肯定是wlan adsp(audio sensor) modem 子系统异常触发中断或者直接进入下面的函数:
subsystem_restart_dev

3.内核中有许多地方调用类似BUG()的语句,它非常像一个内核运行时的断言,意味着本来不该执行到BUG()这条语句,一旦执行即抛出Oops。 BUG()的定义为:

/*
 * BUG() - print the file, line and function that was reached, then panic.
 * Wrapped in do { } while (0) so the expansion behaves as one statement.
 */
#define BUG() do { \
	printk("BUG at %s:%d/%s()!\n", __FILE__, __LINE__, __func__); \
	panic("BUG!"); \
} while (0)

BUG()还有一个变体叫BUG_ON(),它的内部会引用BUG()

/* BUG_ON() - invoke BUG() when @condition evaluates to true. */
#define BUG_ON(condition)			\
	do {					\
		if (unlikely(condition))	\
			BUG();			\
	} while (0)

其中的panic()定义在kernel/panic.c中,会导致内核崩溃,并打印Oops。
内核还有一个稍弱一些的 WARN_ON():当括号中的条件成立时,内核会打印栈回溯,但不会 panic(),表示内核抛出一个警告,暗示发生了某种不太合理的事情。

4.CONFIG_SETUP_SSR_NOTIF_TIMEOUTS 这个宏控可以关闭,没有什么影响


原因分析:

目前出现一个问题:modem 子系统重启偶现不生效,系统仍然 panic。

我这边加了个延时,手动触发 modem crash 可以模拟出来“Subsystem modem crashed during SSR!”:这是因为前一次 modem 子系统重启尚未完成,又触发了下一次 modem 子系统重启,此时 p_state 为 SUBSYS_RESTARTING,从而导致 panic。正常情况下不应该出现这么频繁的子系统重启;为防止这种情况,可以加个标志位,等待上一次 modem 子系统重启完成后才进行下一次子系统重启。

模拟方法:

static void subsystem_restart_wq_func(struct work_struct *work)
{struct subsys_device *dev = container_of(work,struct subsys_device, work);struct subsys_device **list;struct subsys_desc *desc = dev->desc;struct subsys_soc_restart_order *order = dev->restart_order;struct subsys_tracking *track;unsigned int count;unsigned long flags;int ret;/** It's OK to not take the registration lock at this point.* This is because the subsystem list inside the relevant* restart order is not being traversed.*/if (order) {list = order->subsys_ptrs;count = order->count;track = &order->track;} else {list = &dev;count = 1;track = &dev->track;}/** If a system reboot/shutdown is under way, ignore subsystem errors.* However, print a message so that we know that a subsystem behaved* unexpectedly here.*/if(meig_work_flag==1){pr_err("wait complete at the last time\n");return; }meig_work_flag=1;if (system_state == SYSTEM_RESTART|| system_state == SYSTEM_POWER_OFF) {WARN(1, "SSR aborted: %s, system reboot/shutdown is under way\n",desc->name);pr_err("SSR aborted: %s, system reboot/shutdown is under way\n",desc->name);return;}mutex_lock(&track->lock);do_epoch_check(dev);if (dev->track.state == SUBSYS_OFFLINE) {mutex_unlock(&track->lock);WARN(1, "SSR aborted: %s subsystem not online\n", desc->name);pr_err("SSR aborted: %s subsystem not online\n",desc->name);return;}/** It's necessary to take the registration lock because the subsystem* list in the SoC restart order will be traversed and it shouldn't be* changed until _this_ restart sequence completes.*/mutex_lock(&soc_order_reg_lock);pr_err("[%s:%d]: Starting restart sequence for %s\n",current->comm, current->pid, desc->name);notify_each_subsys_device(list, count, SUBSYS_BEFORE_SHUTDOWN, NULL);ret = for_each_subsys_device(list, count, NULL, subsystem_shutdown);if (ret)goto err;notify_each_subsys_device(list, count, SUBSYS_AFTER_SHUTDOWN, NULL);notify_each_subsys_device(list, count, SUBSYS_RAMDUMP_NOTIFICATION,NULL);spin_lock_irqsave(&track->s_lock, flags);track->p_state = 
SUBSYS_RESTARTING;spin_unlock_irqrestore(&track->s_lock, flags);//msleep(3000);/* Collect ram dumps for all subsystems in order here */for_each_subsys_device(list, count, NULL, subsystem_ramdump);for_each_subsys_device(list, count, NULL, subsystem_free_memory);notify_each_subsys_device(list, count, SUBSYS_BEFORE_POWERUP, NULL);ret = for_each_subsys_device(list, count, NULL, subsystem_powerup);if (ret)goto err;notify_each_subsys_device(list, count, SUBSYS_AFTER_POWERUP, NULL);pr_err("[%s:%d]: Restart sequence for %s completed.\n",current->comm, current->pid, desc->name);
+	msleep(3000);//加个延时,通过QXDM发送命令send_data 75 37 03 00 00 触发modem死机;多次发送,即可出现panic("Subsystem %s crashed during SSR!", name);err:/* Reset subsys count */if (ret)dev->count = 0;//msleep(9000);mutex_unlock(&soc_order_reg_lock);mutex_unlock(&track->lock);spin_lock_irqsave(&track->s_lock, flags);pr_err("zhanghong 6666666666666\n");track->p_state = SUBSYS_NORMAL;meig_work_flag=0;__pm_relax(dev->ssr_wlock);spin_unlock_irqrestore(&track->s_lock, flags);
}

规避方案:
1.subsystem_restart_wq_func 函数开始的地方加个延时,等待上一次modem重启完成
2.panic("Subsystem %s crashed during SSR!", name); 改为仅打印


解决方案:

1.如何设置系统restart_level为related
device/qcom/common/rootdir/Android.mk

#<!-- Enable SSR for user version[Solution]Add use/debug control for init.qcom.rc. 
#LOCAL_SRC_FILES    := etc/init.qcom.rc
ifeq ($(TARGET_BUILD_VARIANT),user)
LOCAL_SRC_FILES    := etc/init.qcom.user.rc
else
LOCAL_SRC_FILES    := etc/init.qcom.rc
endif
#END-->

device/qcom/common/rootdir/etc/init.qcom.user.rc

    #sensors log dir
    mkdir /data/vendor/sensors
    chown system system /data/vendor/sensors

    #<!-- Enable SSR Add use/debug control for init.qcom.rc.
    write /sys/bus/msm_subsys/devices/subsys0/restart_level related
    write /sys/bus/msm_subsys/devices/subsys1/restart_level related
    write /sys/bus/msm_subsys/devices/subsys2/restart_level related
    write /sys/bus/msm_subsys/devices/subsys3/restart_level related
    #end-->

# msm specific files that need to be created on /data
on post-fs-data
    mkdir /data/vendor/misc 01771 system system

“qcom,ignore-ssr-failure” can be added in the following node of dtsi
pil_modem: qcom,mss@4080000

modem 如何和AP通,待更新。。。

本文标签: QCM6490 SSR 记述(一)