import pandas as pd
import numpy as np
import os
import altair as alt
alt.data_transformers.disable_max_rows()
from datetime import datetime
from scipy import stats
import matplotlib.pyplot as plt
# current working directory
cwd = os.getcwd()
This is the first part of a two-part series. In this piece we will design the A/B test.
This exercise is taken from a Udemy course on A/B testing. It concerns a social media company, Kittengram, that will run an A/B test on its website after partnering with an ads company specializing in cat ads, with the goal of increasing its Click-Through Rate (CTR). The company provides two post-processed datasets: 1) activity level of users and 2) CTR.
We evaluated the datasets provided and obtained key metrics. The activity-level dataset (filtered to activity level > 0) had 950,875 records, with a mean of 30,673 daily active users and a standard deviation of 91. The CTR dataset had a mean of 33% and a standard deviation of 1.7 percentage points. We observed that the data was too stable to resemble real traffic and is therefore likely synthetic.
We designed an A/B test to validate that introducing ads with cat content would be more relevant to Kittengram's users and would therefore increase the Click-Through Rate (CTR). We also planned to monitor Daily Active Users (DAU) to ensure it remained stable. We established the null hypothesis (H0) to be that no change can be observed after the introduction of the new ads.
The key metrics to evaluate the success of the test are a CTR increase of more than 5% and a detectable DAU change of roughly 0.3% (about 91-100 users). The first KPI should be achievable with about 11.8k clicks per group, and the second would take approximately 13 days. If the proposed 13-day duration is too long, an alternative is to run the test for about 2 days, which would only detect a DAU change of roughly 1% (300 users).
## Loading the data from a .csv file
data = pd.read_csv(cwd+"/resources/activity_pretest.csv")
data.head()
 | userid | dt | activity_level
---|---|---|---
0 | a5b70ae7-f07c-4773-9df4-ce112bc9dc48 | 2021-10-01 | 0 |
1 | d2646662-269f-49de-aab1-8776afced9a3 | 2021-10-01 | 0 |
2 | c4d1cfa8-283d-49ad-a894-90aedc39c798 | 2021-10-01 | 0 |
3 | 6889f87f-5356-4904-a35a-6ea5020011db | 2021-10-01 | 0 |
4 | dbee604c-474a-4c9d-b013-508e5a0e3059 | 2021-10-01 | 0 |
print(data.describe())
print('\n')
if len(data.columns.values) < 10:
    for cols in data.columns.values:
        print(cols)
        print('\n')
        print(data[cols].value_counts().sort_values())
        print('\n')
        print('------------------------')
        print('\n')
activity_level: count 1,860,000 | mean 5.24 | std 6.52 | min 0 | 25% 0 | 50% 1 | 75% 10 | max 20
userid: 60,000 unique ids, each appearing 31 times
dt: 31 dates (2021-10-01 through 2021-10-31), 60,000 records each
activity_level value counts: level 0 → 909,125; levels 1-19 → roughly 48,300-49,200 each; level 20 → 24,520
## Exploring duplicates
data[data["userid"].duplicated()]
 | userid | dt | activity_level
---|---|---|---
29366 | a5b70ae7-f07c-4773-9df4-ce112bc9dc48 | 2021-10-02 | 0 |
29368 | eccec621-b2bb-4695-b817-5bc80e35028b | 2021-10-02 | 0 |
29370 | 651d007e-4234-48a5-954b-5379e44c4f67 | 2021-10-02 | 0 |
29373 | ba7f447a-bc16-4164-8c8d-3cd73dea17b3 | 2021-10-02 | 0 |
29377 | aede54f3-1436-4eb9-8148-2df16703cc2d | 2021-10-02 | 0 |
... | ... | ... | ... |
1859995 | 200d65e6-b1ce-4a47-8c2b-946db5c5a3a0 | 2021-10-31 | 20 |
1859996 | 535dafe4-de7c-4b56-acf6-aa94f21653bc | 2021-10-31 | 20 |
1859997 | 0428ca3c-e666-4ef4-8588-3a2af904a123 | 2021-10-31 | 20 |
1859998 | a8cd1579-44d4-48b3-b3d6-47ae5197dbc6 | 2021-10-31 | 20 |
1859999 | bac5da9e-ef79-4ae9-9efe-cd6eca093db2 | 2021-10-31 | 20 |
1800000 rows × 3 columns
## We can see there is no true duplication: each user's activity is simply recorded once per day (a direct check follows the table below)
data[data["userid"]=='a8cd1579-44d4-48b3-b3d6-47ae5197dbc6'].groupby('dt').count()
dt | userid | activity_level
---|---|---
2021-10-01 | 1 | 1 |
2021-10-02 | 1 | 1 |
2021-10-03 | 1 | 1 |
2021-10-04 | 1 | 1 |
2021-10-05 | 1 | 1 |
2021-10-06 | 1 | 1 |
2021-10-07 | 1 | 1 |
2021-10-08 | 1 | 1 |
2021-10-09 | 1 | 1 |
2021-10-10 | 1 | 1 |
2021-10-11 | 1 | 1 |
2021-10-12 | 1 | 1 |
2021-10-13 | 1 | 1 |
2021-10-14 | 1 | 1 |
2021-10-15 | 1 | 1 |
2021-10-16 | 1 | 1 |
2021-10-17 | 1 | 1 |
2021-10-18 | 1 | 1 |
2021-10-19 | 1 | 1 |
2021-10-20 | 1 | 1 |
2021-10-21 | 1 | 1 |
2021-10-22 | 1 | 1 |
2021-10-23 | 1 | 1 |
2021-10-24 | 1 | 1 |
2021-10-25 | 1 | 1 |
2021-10-26 | 1 | 1 |
2021-10-27 | 1 | 1 |
2021-10-28 | 1 | 1 |
2021-10-29 | 1 | 1 |
2021-10-30 | 1 | 1 |
2021-10-31 | 1 | 1 |
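A more direct check (a minimal sketch on the same DataFrame): counting duplicates over the (userid, dt) pair should return zero if each user is recorded at most once per day.
# True duplicates would repeat both userid and dt;
# zero confirms one record per user per day.
print(data.duplicated(subset=['userid', 'dt']).sum())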
graph = data.activity_level.value_counts().sort_index()
print(graph)
plt.style.use('fivethirtyeight')
plt.suptitle('Activity Level')
plt.xlabel('activity level')
plt.ylabel('num users')
plt.bar(graph.index, graph.values)
plt.show()
activity_level value counts (levels 0-20): level 0 → 909,125; levels 1-19 → roughly 48,300-49,200 each; level 20 → 24,520
(Bar chart: number of users per activity level; level 0 dominates.)
data.groupby('activity_level').describe().head()
activity_level | userid count | userid unique | userid top | userid freq | dt count | dt unique | dt top | dt freq
---|---|---|---|---|---|---|---|---
0 | 909125 | 60000 | 6b953416-72e5-4b6e-b634-41c8d3bf98a4 | 27 | 909125 | 31 | 2021-10-11 | 29511 |
1 | 48732 | 33688 | 3c5297b6-602e-4479-9a97-e2b4cb444f0a | 6 | 48732 | 31 | 2021-10-19 | 1620 |
2 | 49074 | 33761 | 3d5b7e5d-d7b8-459b-a4f0-33231fc930fd | 6 | 49074 | 31 | 2021-10-14 | 1665 |
3 | 48659 | 33634 | fd9d8064-2f3f-47ba-9deb-0a38bc0b1a3d | 6 | 48659 | 31 | 2021-10-28 | 1663 |
4 | 48556 | 33502 | dc396a83-174c-4244-8a33-71eae2283eeb | 8 | 48556 | 31 | 2021-10-29 | 1632 |
activity = data.query('activity_level > 0').groupby(['dt', 'activity_level']).count().reset_index()
activity
 | dt | activity_level | userid
---|---|---|---
0 | 2021-10-01 | 1 | 1602 |
1 | 2021-10-01 | 2 | 1507 |
2 | 2021-10-01 | 3 | 1587 |
3 | 2021-10-01 | 4 | 1551 |
4 | 2021-10-01 | 5 | 1586 |
... | ... | ... | ... |
615 | 2021-10-31 | 16 | 1499 |
616 | 2021-10-31 | 17 | 1534 |
617 | 2021-10-31 | 18 | 1531 |
618 | 2021-10-31 | 19 | 1616 |
619 | 2021-10-31 | 20 | 783 |
620 rows × 3 columns
chart1=alt.Chart(activity).mark_line(size=1).encode(
alt.X('dt:T', axis=alt.Axis(title = 'date')),
alt.Y('userid:Q', axis=alt.Axis(title = 'number of users')),
tooltip=['activity_level'],
color='activity_level:N'
).properties(
title='Daily Users per Activity Level'
)
activity = data.query('activity_level > 0').groupby(['dt']).count().reset_index()
activity.describe()
 | userid | activity_level
---|---|---
count | 31.000000 | 31.000000 |
mean | 30673.387097 | 30673.387097 |
std | 90.968375 | 90.968375 |
min | 30489.000000 | 30489.000000 |
25% | 30608.000000 | 30608.000000 |
50% | 30661.000000 | 30661.000000 |
75% | 30728.500000 | 30728.500000 |
max | 30902.000000 | 30902.000000 |
chart2=alt.Chart(activity).mark_line(size=4).encode(
alt.X('dt:T', axis=alt.Axis(title = 'date')),
alt.Y('userid:Q', axis=alt.Axis(title = 'number of users'))
).properties(
title='Daily Active Users'
)
chart2 | chart1
This data seems surprisingly flat. If this weren't synthetic data, we would need to understand why the behavior is so flat, which would require more information about how the app works. Perhaps users are encouraged to join N times per day to maintain a certain level, or are limited in how many times they can log in (e.g. Duolingo).
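To put a number on this flatness, a quick sketch reusing the daily `activity` counts computed above:
# Coefficient of variation of daily active users; organic traffic
# typically shows weekly seasonality and far more spread than this.
cv = activity.userid.std() / activity.userid.mean()
print(f'DAU coefficient of variation: {cv:.2%}')  # ≈ 0.30%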
data2 = pd.read_csv(cwd+"/resources/ctr_pretest.csv")
data2.head()
 | userid | dt | ctr
---|---|---|---
0 | 4b328144-df4b-47b1-a804-09834942dce0 | 2021-10-01 | 34.28 |
1 | 34ace777-5e9d-40b3-a859-4145d0c35c8d | 2021-10-01 | 34.67 |
2 | 8028cccf-19c3-4c0e-b5b2-e707e15d2d83 | 2021-10-01 | 34.77 |
3 | 652b3c9c-5e29-4bf0-9373-924687b1567e | 2021-10-01 | 35.42 |
4 | 45b57434-4666-4b57-9798-35489dc1092a | 2021-10-01 | 35.04 |
data2.describe()
 | ctr
---|---
count | 950875.000000 |
mean | 33.000242 |
std | 1.731677 |
min | 30.000000 |
25% | 31.500000 |
50% | 33.000000 |
75% | 34.500000 |
max | 36.000000 |
ctr = data2.groupby(['dt'])['ctr'].mean().reset_index()  # select ctr to avoid averaging the non-numeric userid column
ctr
 | dt | ctr
---|---|---
0 | 2021-10-01 | 32.993446 |
1 | 2021-10-02 | 32.991664 |
2 | 2021-10-03 | 32.995086 |
3 | 2021-10-04 | 32.992995 |
4 | 2021-10-05 | 33.004375 |
5 | 2021-10-06 | 33.018564 |
6 | 2021-10-07 | 32.988500 |
7 | 2021-10-08 | 32.998654 |
8 | 2021-10-09 | 33.005082 |
9 | 2021-10-10 | 33.007134 |
10 | 2021-10-11 | 32.990300 |
11 | 2021-10-12 | 32.996166 |
12 | 2021-10-13 | 32.984248 |
13 | 2021-10-14 | 32.999878 |
14 | 2021-10-15 | 33.008517 |
15 | 2021-10-16 | 32.991025 |
16 | 2021-10-17 | 33.001919 |
17 | 2021-10-18 | 33.007763 |
18 | 2021-10-19 | 33.001511 |
19 | 2021-10-20 | 33.004632 |
20 | 2021-10-21 | 32.997566 |
21 | 2021-10-22 | 33.006785 |
22 | 2021-10-23 | 33.012228 |
23 | 2021-10-24 | 32.984093 |
24 | 2021-10-25 | 32.990223 |
25 | 2021-10-26 | 33.014248 |
26 | 2021-10-27 | 33.007045 |
27 | 2021-10-28 | 33.005711 |
28 | 2021-10-29 | 33.004230 |
29 | 2021-10-30 | 33.016430 |
30 | 2021-10-31 | 32.987515 |
alt.Chart(ctr).mark_line(size=4).encode(
alt.X('dt:T', axis=alt.Axis(title = 'date')),
alt.Y('ctr:Q', axis=alt.Axis(title = 'ctr'), scale=alt.Scale(domain=[32, 34])),
tooltip=['ctr'],
).properties(
width=600,
height=400,
title='Average Daily CTR'
)
With a 33% rate, our CTR far exceeds the industry reference points below [^1][^2]. This is likely an artifact of the data generation, since the problem statement described a poorly performing CTR.
[^1]: AdBraze, "Key Instagram metrics to follow in 2022", published Dec 29, 2022. Accessed March 23, 2023. https://adbraze.com/blog/key-instagram-benchmarks
[^2]: Smart Insights, display advertising clickthrough rates, published Feb 14, 2023. Accessed March 23, 2023. https://www.smartinsights.com/internet-advertising/internet-advertising-analytics/display-advertising-clickthrough-rates/
Hypothesis (H1):
If we include more relevant ads:
1) CTR should significantly increase as people will be more likely to click the ad
2) User satisfaction will increase
3) Affiliate revenue will be higher
Null Hypothesis (H0):
No change will be observed after introducing the new ads
Success metrics:
- CTR should increase
- Daily Active Users should remain stable or increase
Guardrail metrics:
- Customer Satisfaction will increase or remain stable
- Affiliate Revenue should increase
- Bounce rate should not increase
- Retention should not decrease
- Number of errors/crashes should stay within allowed limits
In this exercise we do not have access to additional data, so we will be unable to monitor these guardrail metrics.
When to start the test:
Considerations:
- Are there marketing campaigns currently running?
- What size of control group is available?
- How many other tests are scheduled or running?
- Are there business restrictions (time, users, risks)?
Minimum Detectable Effect (MDE):
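Since the business has not specified a target effect size, the code below proxies the MDE with each metric's coefficient of variation:

$$\mathrm{MDE} \approx \frac{\sigma}{\mu}$$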
digits = 3  # number of relevant digits
mean = round(activity.userid.mean(), digits)
std = round(activity.userid.std(), digits)
mde = round(std / mean, digits)
print(' For DAU: ', '\n', '\n',
      'Mean: ', mean, '\n',
      'Standard Deviation: ', std, '\n',
      'mde: ', mde * 100, '%',
      )
For DAU:
Mean: 30673.387
Standard Deviation: 90.968
mde: 0.3 %
digits = 4  # number of relevant digits
# divided by 100 as the raw values are percentages
ctr_mean = round(data2.ctr.mean() / 100, digits)
ctr_std = round(data2.ctr.std() / 100, digits)
ctr_mde = round(ctr_std / ctr_mean, digits)
print(' For Clickrate: ', '\n', '\n',
      'Mean: ', ctr_mean, '\n',
      'Standard Deviation: ', ctr_std, '\n',
      'mde: ', ctr_mde * 100, '%'  # multiply by 100 to report as a percentage
      )
For Clickrate:
Mean: 0.33
Standard Deviation: 0.0173
mde: 5.24 %
Statistical significance level (alpha, the Type I error): this is the acceptable probability of mistakenly rejecting H0, i.e. thinking we see an impact when there is none (a false positive).
alpha=0.05
ss=1-alpha
Power (1 - beta, where beta is the Type II error): this is the probability of mistakenly accepting H0, i.e. thinking we have no impact when in reality we do (a false negative).
Usually beta is between 10% and 20%, with beta = 20% being the most common choice in digital experiments.
beta=0.2
power=(1-beta)
In our case:
We need a z-test, since CTR is a binomial (discrete) metric with a large sample.
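The function below implements the standard per-group sample size for a two-proportion z-test, where $p_1$ is the baseline rate and $\bar{p}$ is the pooled rate:

$$N = \frac{2\,\bar{p}\,(1-\bar{p})\,\left(z_{1-\alpha/2} + z_{1-\beta}\right)^2}{\mathrm{MDE}^2}, \qquad \bar{p} = \frac{p_1 + (p_1 + \mathrm{MDE})}{2}$$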
def binomial_sample_size(metric, mde, alpha, beta):
    # standard normal distribution to determine z-values
    snd = stats.norm(0, 1)
    Z_beta = snd.ppf(1 - beta)
    Z_alpha = snd.ppf(1 - alpha / 2)
    # average of the success probabilities of the two groups
    p = (metric + (metric + mde)) / 2
    N = (2 * p *
         (1 - p) *
         ((Z_beta + Z_alpha)**2
          / mde**2))
    return Z_beta, Z_alpha, p, N
# For CTR
name = 'CTR'
unit = 'clicks'
metric = ctr_mean
mde = ctr_std
digits = 2
(Z_beta, Z_alpha, p, N) = binomial_sample_size(metric=metric, mde=mde, alpha=alpha, beta=beta)
print(' Using: ', '\n',
      'mean: ', metric, '\n',
      'mde: ', mde, '\n',
      'alpha: ', alpha, '\n',
      'beta: ', beta, '\n',
      'Z_alpha: ', round(Z_alpha, digits), '\n',
      'Z_beta: ', round(Z_beta, digits), '\n',
      'p: ', round(p, digits), '\n', '\n', 'Obtained:', '\n',
      'N= ', round(N, 1), unit
      )
Using:
mean: 0.33
mde: 0.0173
alpha: 0.05
beta: 0.2
Z_alpha: 1.96
Z_beta: 0.84
p: 0.34
Obtained:
N= 11747.0 clicks
def continuous_sample_size(metric, mde, sd, alpha, beta):
    # standard normal distribution to determine z-values
    snd = stats.norm(0, 1)
    Z_beta = snd.ppf(1 - beta)
    Z_alpha = snd.ppf(1 - alpha / 2)
    N = (2 * sd**2 *
         (Z_beta + Z_alpha)**2
         / mde**2)
    return Z_beta, Z_alpha, N
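This mirrors the standard two-sample z-test sample size for a continuous metric with standard deviation $\sigma$:

$$N = \frac{2\,\sigma^2\,\left(z_{1-\alpha/2} + z_{1-\beta}\right)^2}{\mathrm{MDE}^2}$$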
# For DAU
name = 'DAU'
unit = 'days'
mde = 300
# metric is unused by the continuous formula; sd must be the DAU std
(Z_beta, Z_alpha, N) = continuous_sample_size(metric=metric, mde=mde, sd=std, alpha=alpha, beta=beta)
print(' Using: ', '\n',
      'U1: ', mean, '\n',
      'mde: ', mde, '\n',
      'U2: ', mean - mde, '\n',
      'sd: ', std, '\n',
      'alpha: ', alpha, '\n',
      'beta: ', beta, '\n',
      '\n',
      '\n', 'Obtained:', '\n',
      'Z_beta: ', round(Z_beta, digits), '\n',
      'Z_alpha: ', round(Z_alpha, digits), '\n',
      '\n', 'N= ', round(N, 1), unit
      )
Using:
U1: 30673.387
mde: 300
U2: 30373.387
sd: 90.968
alpha: 0.05
beta: 0.2
Obtained:
Z_beta: 0.84
Z_alpha: 1.96
N= 1.4 days
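To ground the alternative durations mentioned in the design summary, here is a quick sensitivity sketch; the candidate MDE values (91, 100 and 300 daily users) are my own illustrative choices, not part of the course material.
# Sweep a few candidate MDEs (in daily users) to see how the required
# duration, in days of DAU observations, trades off against sensitivity.
for mde_users in (91, 100, 300):  # illustrative candidate values
    _, _, n_days = continuous_sample_size(metric=mean, mde=mde_users,
                                          sd=std, alpha=alpha, beta=beta)
    print(f'MDE of {mde_users} users ({mde_users / mean:.2%} of DAU): {n_days:.1f} days')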