2023年6月23日发(作者:)
hive的⼀些指标计算已知⼀个点击⾏为,求在这之前该⽤户的点击路径已知点(DDD)--collect_list()//不去重create table ev_1 asselect _id,collect_set(_id) as ev_listfrom(select user_id,event_id,timefrom user_event,where from_unixtime(cast(time/1000 as bigint),'yyyy-MM-dd')>='2020-03-05'and from_unixtime(cast(time/1000 as bigint),'yyyy-MM-dd')<='2020-03-06'and day >='20200305' and day <='20200306'group by user_idORDER BY time) tbgroup by _id按照时间戳排序后列转⾏,找array中DDD并把ev_list转为字符串分割select _id,substr(_list,1,(index(_list,'DDD')+length('DDD'))) ev_path,_sizefrom(select user_id,concat_ws(',',ev_list) str_list,size(ev_list) as lst_sizefrom ev_1where array_contains(ev_list, 'DDD')
)tb2⽤户连续签到天数(取每个⽤户的天数最⼤值)列明:id,连续签到天数,开始时间,结束时间select userid,diff,qujian[0] starttime,qujian[size(qujian)-1] endtimefrom (select userid,date2,datediff(max(Time),min(Time)) diff,COLLECT_set(Time) qujian from ( select userid,Time,date_sub(Time,rank) as date2 from ( select userid,create_time Time,row_number () over (partition by userid order by create_time) as rank
from _xchz where date='201911' ) a ) a group by userid,date2)a⽤户埋点中,数据解析埋点事件中属性值清洗,事件值中解析出视频播放量,最后和视频表结合create table day_play_log (usernickname string,videoid string,videoplay string,attr_v string,cuid string,ct string) partitioned by (time string);show partitions day_play_log;--取当天视频播放量最⼤的id(历史数据取9⽉23号数据)INSERT INTO TABLE day_play_logPARTITION (time='2019-09-23')select [0] usernickname,[1] videoid,[3] videoplay,attr_v,cuid,ctfrom ( select split(attr_v,',') as ssmcs,attr_v,cuid,ct from dwd_user_evt_attr where date='20190923' and FROM_UNIXTIME(cast(ct/1000 as BIGINT),'yyyy-MM-dd')='2019-09-23'
and eid='视频详情' and attr='视频信息') tmp;⽤户昵称,视频id,播放量,视频详情长段(每天视频的最⼤播放量)create table day_video_play (usernickname string,videoid string,videoplay string,attr_v string) partitioned by (time string);show partitions day_video_play;INSERT INTO TABLE day_video_playPARTITION (time='2019-09-23')select ckname,d,lay,_vfrom(select ckname,d,lay,_v,row_number() over(partition by d order by lay desc) as rnfrom day_play_log tb1where time='2019-09-23')tmpwhere =1;⽤户注册后,6天内有登陆判定6天(d)=518400000 毫秒(ms)1天(d)=86400000毫秒(ms)create table tmp_nginx_log_2 asselect a.r_user_id,case when _TIME is null then 0 else from_unixtime(cast(_TIME/1000 as bigint)) end as CREATE_TIME,case when (ttime>=_TIME+86400000 and ttime<=_TIME+518400000) then 1 else 0 end as first_requestfrom _tb aleft join _user tb on ve_user_id=_IDleft join _nginx_log b on _id=where from_unixtime(cast(_time/1000 as bigint),'yyyy-MM-dd')>='2019-07-21'and from_unixtime(cast(_time/1000 as bigint),'yyyy-MM-dd')<='2019-08-14'清洗⽤户业务埋点,求第⼀次业务点击的点击⾏为day_C=`date -d yesterday +%F`INSERT INTO TABLE day_table
select event,count(1) number,'$day_C'
from
(select * from (select event,userid,row_number() over(partition by userid order by requesttime) as rn
group by event from (select tb1.* from day_event tb1, day_eventname tb2 where ='$month_day' and ='$month_day' and = _name)as tb3) as tb常⽤的统计⽅法PV:(页⾯访问量)select count(userid)from requesttimewhere time=''UV:独⽴访客,PV的去重版select count(distinct userid)from requesttimewhere time=''特殊埋点的点击次数,点击⼈数select count(userid),count(distinct userid)from eventwhere event_id in ('','','','','')⼴告事件点击次数⼈数,按照⼴告类型和渠道分类select 类型,渠道,count(userid),count(distinct userid)from ⼴告表group by 类型,渠道Hive中总体标准差的计算有两个函数可以使⽤,分别是stddev函数和stddev_pop函数:select stddev_pop(feature1) as std_feature1, stddev(feature3) as std_feature3,from
iris;样本标准差使⽤stddev_samp⽅法:select stddev_samp(feature1) as std_feature1,from
iris;中位数整数select percentile(feature1,0.5) as median_feature1 from iris;中位数浮点数select percentile_approx(feature1,0.5) as median_feature1 from iris;⽤户最近⼀次请求时间month_day=`date -d "-1 day" +%Y%m%d`day_C=`date -d "-1day" +%F`day_D=`date -d "-2day" +%F`hive -e"INSERT INTO TABLE day_nginx_userPARTITION (time='$day_C')
select distinct useridfrom user_nginx_logwhere date='$month_day' and (length(userid)=36 or length(userid)=32);"hive -e"INSERT INTO TABLE day_useir_requesttime_max
PARTITION (date='$day_C')select ,max(_time) as log_time from(select userid,log_timefrom day_useir_requesttime_maxwhere date='$day_D'union allselect userid,time as log_timefrom day_nginx_userwhere time='$day_C') tmpgroup by ;"
发布者:admin,转转请注明出处:http://www.yc00.com/web/1687518367a16437.html
评论列表(0条)