import pandas as pd
            import numpy as np
            import os
            import altair as alt
            alt.data_transformers.disable_max_rows()
            from datetime import datetime
            from scipy import stats
            import matplotlib.pyplot as plt

array([ 0,  0,  0, ..., 20, 20, 20])

0.0

'0.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000'

15320.870967741936

15352.516129032258

0.1630842353828084

'0.1630842353828083901579049097563256509602069854736328125000000000000000000000000000000000000000000000'

15782.0

29302.433333333334

6.590603584107244e-84


            import pandas as pd
            import numpy as np
            import os
            import altair as alt
            alt.data_transformers.disable_max_rows()
            from datetime import datetime
            from scipy import stats
            import matplotlib.pyplot as plt


            # Working directory; the resources/ CSV paths are built relative to it.
            cwd = os.getcwd()


            - Exploring the datasets, we corroborated that a roughly equal number of users was assigned to each group.
            -


            # Read the assignment records from the CSV file under resources/.
            data = pd.read_csv(cwd + "/resources/assignments.csv")
            data.head()


            # Add a `dt` column holding the calendar day (YYYY-MM-DD) of each `ts` timestamp.
            data['dt'] = [
                datetime.strptime(ts, '%Y-%m-%dT%H:%M:%SZ').strftime("%Y-%m-%d")
                for ts in data['ts']
            ]
            data.head(5)


            # describe() summary of the assignments table.
            data.describe()


            # Row counts per group.
            data.groupby('groupid').count()


            # Row counts per group and day, with the group keys turned back into columns.
            data_count = (
                data
                .groupby(['groupid', 'dt'])
                .count()
                .reset_index()
            )


            data_count.head()


            # One line per group: `userid` count (y) against `dt` (x).
            alt.Chart(data_count).mark_line(size=3).encode(
                x=alt.X('dt'),
                y=alt.Y('userid'),
                color='groupid:O',
                tooltip=['userid'],
            ).properties(width=600, height=400)


            # Load the activity table from resources/.
            data_act = pd.read_csv(cwd + "/resources/activity_all.csv")


            data_act.head()


            # Summary statistics of each group on each day.
            # We can already see a difference in mean activity and the 50% & 75% percentiles.
            data_act.groupby(['groupid', 'dt']).describe()


            # Rows with activity_level > 0, counted per day and group (first rows only).
            (
                data_act
                .query('activity_level > 0')
                .groupby(['dt', 'groupid'])
                .count()
                .reset_index()
                .head()
            )


            # The same per-day, per-group counts drawn as one line per group.
            active_per_day = (
                data_act
                .query('activity_level > 0')
                .groupby(['dt', 'groupid'])
                .count()
                .reset_index()
            )
            alt.Chart(active_per_day).mark_line(size=3).encode(
                x=alt.X('dt'),
                y=alt.Y('userid'),
                color='groupid:O',
                tooltip=['userid'],
            ).properties(width=600, height=400)


            # Group 0, dt >= 2021-11-01: summary of the daily number of rows with activity_level > 0.
            (
                data_act
                .query('activity_level > 0 and groupid == 0 and dt >= "2021-11-01"')
                .groupby(['dt', 'groupid'])
                .count()
                .reset_index()
                .loc[:, ['groupid', 'activity_level']]
                .describe()
            )


            # Group 1, dt >= 2021-11-01: same summary.
            (
                data_act
                .query('activity_level > 0 and groupid == 1 and dt >= "2021-11-01"')
                .groupby(['dt', 'groupid'])
                .count()
                .reset_index()
                .loc[:, ['groupid', 'activity_level']]
                .describe()
            )


            # Per-group summary statistics from 2021-11-01 onwards ...
            from_cutoff = data_act.query('dt >= "2021-11-01"')
            from_cutoff.groupby('groupid').describe()


            # ... and for the days before 2021-11-01.
            until_cutoff = data_act.query('dt < "2021-11-01"')
            until_cutoff.groupby('groupid').describe()


            # Number of rows with activity_level > 0 for every (group, day) pair.
            data_act_count = (
                data_act
                .query('activity_level > 0')
                .groupby(['groupid', 'dt'])
                .count()
                .reset_index()
            )


            data_act_count.head()


            # One line per group: `userid` count (y) against `dt` (x).
            alt.Chart(data_act_count).mark_line(size=3).encode(
                x=alt.X('dt'),
                y=alt.Y('userid'),
                color='groupid:O',
                tooltip=['userid'],
            ).properties(width=600, height=400)


            from scipy.stats import ttest_ind


            # All activity_level values of group 0 as a NumPy array.
            group0_rows = data_act.query('groupid == 0')
            group0_rows['activity_level'].to_numpy()

array([ 0,  0,  0, ..., 20, 20, 20])


            # Two-sample t-test on activity_level, group 0 vs group 1, for dt >= 2021-11-01.
            group0_activity = data_act.query('groupid == 0 and dt >= "2021-11-01"')['activity_level'].to_numpy()
            group1_activity = data_act.query('groupid == 1 and dt >= "2021-11-01"')['activity_level'].to_numpy()
            res = ttest_ind(group0_activity, group1_activity).pvalue

            print(res)
            # Reading the p-value:
            # - close to 1: the two sample means are practically identical
            #   (comparing groupid == 0 with itself gives exactly 1).
            # - close to 0: a difference this large would be extremely unlikely
            #   if both groups had the same mean activity.

0.0


            # Fixed-point rendering with 100 decimals, to see whether the p-value is exactly zero.
            format(res, ".100f")

'0.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000'


            # Split the per-day, per-group counts at the 2021-11-01 cut-off.
            before = data_act_count.query('dt < "2021-11-01"')


            after = data_act_count.query('dt >= "2021-11-01"')


            before.head()


            # Mean daily `userid` count of group 0 before the cut-off.
            before.query('groupid == 0')['userid'].to_numpy().mean()

15320.870967741936


            # Mean daily `userid` count of group 1 before the cut-off.
            before.query('groupid == 1')['userid'].to_numpy().mean()

15352.516129032258


            # Two-sample t-test on the daily `userid` counts before the cut-off, group 0 vs group 1.
            group0_counts = before.query('groupid == 0')['userid'].to_numpy()
            group1_counts = before.query('groupid == 1')['userid'].to_numpy()
            res = ttest_ind(group0_counts, group1_counts).pvalue

            print(res)

            # Reading the p-value:
            # - close to 1: the two sample means are practically identical
            #   (comparing groupid == 0 with itself gives exactly 1).
            # - close to 0: a difference this large would be extremely unlikely
            #   if both groups had the same mean.

0.1630842353828084


            # Fixed-point rendering of the p-value with 100 decimals.
            format(res, ".100f")

'0.1630842353828083901579049097563256509602069854736328125000000000000000000000000000000000000000000000'


            # Mean daily `userid` count of group 0 from the cut-off onwards.
            after.query('groupid == 0')['userid'].to_numpy().mean()

15782.0


            # Mean daily `userid` count of group 1 from the cut-off onwards.
            after.query('groupid == 1')['userid'].to_numpy().mean()

29302.433333333334


            # Two-sample t-test on the daily `userid` counts from the cut-off onwards, group 0 vs group 1.
            group0_counts = after.query('groupid == 0')['userid'].to_numpy()
            group1_counts = after.query('groupid == 1')['userid'].to_numpy()
            res = ttest_ind(group0_counts, group1_counts).pvalue

            print(res)

6.590603584107244e-84


            # Fixed-point rendering of the p-value with 100 decimals.
            format(res, ".100f")

'0.0000000000000000000000000000000000000000000000000000000000000000000000000000000000065906035841072442'


            # Load the click-through-rate table from resources/.
            ctr_path = cwd + "/resources/ctr_all.csv"
            data_ctr = pd.read_csv(ctr_path)


            data_ctr.head()


            # Mean CTR per group and day. Only `ctr` is averaged: data_ctr also carries the
            # string column `userid`, and GroupBy.mean() raises TypeError on non-numeric
            # columns in pandas >= 2.0 (older versions silently dropped them).
            # The resulting columns are groupid, dt and ctr.
            data_ctr_avg = data_ctr.groupby(['groupid', 'dt'])['ctr'].mean().reset_index()


            # One line per group: `ctr` from data_ctr_avg (y) against `dt` (x).
            alt.Chart(data_ctr_avg).mark_line(size=5).encode(
                x=alt.X('dt'),
                y=alt.Y('ctr'),
                color='groupid:O',
                tooltip=['ctr'],
            ).properties(width=600, height=400)


            # Split the CTR rows at the 2021-11-01 cut-off, keeping only group and CTR.
            ctr_columns = ['groupid', 'ctr']
            before = data_ctr.query('dt < "2021-11-01"')[ctr_columns]


            after = data_ctr.query('dt >= "2021-11-01"')[ctr_columns]


            after


            # Mean CTR of group 0 before the cut-off.
            np.mean(before.query('groupid == 0')['ctr'].to_numpy())

33.00091277553074


            # Mean CTR of group 1 before the cut-off.
            np.mean(before.query('groupid == 1')['ctr'].to_numpy())

32.99957172093258


            # Mean CTR of group 0 from the cut-off onwards.
            np.mean(after.query('groupid == 0')['ctr'].to_numpy())

32.996977569382835


            # Mean CTR of group 1 from the cut-off onwards.
            np.mean(after.query('groupid == 1')['ctr'].to_numpy())

37.99695912626142


            # Population standard deviation (ddof=0) of group 0's CTR before the cut-off.
            np.std(before.query('groupid == 0')['ctr'].to_numpy())

1.7336979501682888


            # Population standard deviation (ddof=0) of group 1's CTR before the cut-off.
            np.std(before.query('groupid == 1')['ctr'].to_numpy())

1.7296548367391134


            # Population standard deviation (ddof=0) of group 0's CTR from the cut-off onwards.
            np.std(after.query('groupid == 0')['ctr'].to_numpy())

1.7331985918552912


            # Population standard deviation (ddof=0) of group 1's CTR from the cut-off onwards.
            np.std(after.query('groupid == 1')['ctr'].to_numpy())

1.7323710606903675


            # Two-sample t-test on CTR before the cut-off, group 0 vs group 1.
            group0_ctr = before.query('groupid == 0')['ctr'].to_numpy()
            group1_ctr = before.query('groupid == 1')['ctr'].to_numpy()
            res = ttest_ind(group0_ctr, group1_ctr).pvalue

            print(res)

            # Reading the p-value:
            # - close to 1: the two sample means are practically identical
            #   (comparing groupid == 0 with itself gives exactly 1).
            # - close to 0: a difference this large would be extremely unlikely
            #   if both groups had the same mean CTR.

0.705741417344299


            # Two-sample t-test on CTR from the cut-off onwards, group 0 vs group 1.
            group0_ctr = after.query('groupid == 0')['ctr'].to_numpy()
            group1_ctr = after.query('groupid == 1')['ctr'].to_numpy()
            res = ttest_ind(group0_ctr, group1_ctr).pvalue
            print(res)
            # Reading the p-value:
            # - close to 1: the two sample means are practically identical
            #   (comparing groupid == 0 with itself gives exactly 1).
            # - close to 0: a difference this large would be extremely unlikely
            #   if both groups had the same mean CTR.

0.0


            # Fixed-point rendering of the p-value with 100 decimals.
            format(res, ".100f")

'0.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000'

	userid	ts
0	c5d77c89-33a3-4fe3-9e31-179dec09d49c	2021-11-02T07:31:42Z
1	9061d751-7a94-44d3-8792-5ca5ec59aa89	2021-11-13T07:43:51Z
2	a5b70ae7-f07c-4773-9df4-ce112bc9dc48	2021-11-20T19:26:07Z
3	d2646662-269f-49de-aab1-8776afced9a3	2021-11-20T11:09:02Z
4	2d9b23b7-4e5e-4162-9f0f-49e593fdd2b5	2021-11-04T07:42:07Z

	userid	ts	dt
0	c5d77c89-33a3-4fe3-9e31-179dec09d49c	2021-11-02T07:31:42Z	2021-11-02
1	9061d751-7a94-44d3-8792-5ca5ec59aa89	2021-11-13T07:43:51Z	2021-11-13
2	a5b70ae7-f07c-4773-9df4-ce112bc9dc48	2021-11-20T19:26:07Z	2021-11-20
3	d2646662-269f-49de-aab1-8776afced9a3	2021-11-20T11:09:02Z	2021-11-20
4	2d9b23b7-4e5e-4162-9f0f-49e593fdd2b5	2021-11-04T07:42:07Z	2021-11-04

	userid	ts	dt
groupid
0	29951	29951	29951
1	30049	30049	30049

	dt	userid	ts
0	2021-11-01	1497	1497
1	2021-11-02	1467	1467
2	2021-11-03	1532	1532
3	2021-11-04	1509	1509
4	2021-11-05	1503	1503

	userid	dt	groupid
0	a5b70ae7-f07c-4773-9df4-ce112bc9dc48	2021-10-01	0
1	d2646662-269f-49de-aab1-8776afced9a3	2021-10-01	0
2	c4d1cfa8-283d-49ad-a894-90aedc39c798	2021-10-01	1
3	6889f87f-5356-4904-a35a-6ea5020011db	2021-10-01	0
4	dbee604c-474a-4c9d-b013-508e5a0e3059	2021-10-01	1

DESCRIPTION¶

SUMMARY¶

DATA EXPLORING¶

1) Assignments¶

User activity¶

Evaluating how many times a day users were active on average¶

Comparing the activity between the groups using a t-test, given that we have a large number of observations¶

By the number of active users¶

Checking for the pretest bias on activity.¶

Click through rate (CTR)¶

	groupid
count	60000.000000
mean	0.500817
std	0.500003
min	0.000000
25%	0.000000
50%	1.000000
75%	1.000000
max	1.000000

	dt	groupid	userid	activity_level
0	2021-10-01	0	15337	15337
1	2021-10-01	1	15297	15297
2	2021-10-02	0	15354	15354
3	2021-10-02	1	15421	15421
4	2021-10-03	0	15423	15423

	groupid	activity_level
count	30.0	30.000000
mean	0.0	15782.000000
std	0.0	371.077276
min	0.0	15163.000000
25%	0.0	15335.000000
50%	0.0	15990.500000
75%	0.0	16045.000000
max	0.0	16147.000000

	userid	dt	ctr
0	60389fa7-2d71-4cdf-831c-c2bb277ffa1e	2021-11-13	31.81
1	b59cb225-d160-4851-92d2-7cc8120a2f63	2021-11-13	30.46
2	aa336050-934e-453f-a5b0-dd881fcd114e	2021-11-13	34.25
3	8df767f4-a10f-4322-a722-676b7e02b372	2021-11-13	34.92
4	a74762ed-4da0-42ab-91d2-40d7e808dfe9	2021-11-13	34.95

		activity_level
		count	mean	std	min	25%	50%	75%	max
groupid	dt
0	2021-10-01	29951.0	5.241762	6.516640	0.0	0.0	1.0	10.0	20.0
	2021-10-02	29951.0	5.255885	6.509838	0.0	0.0	1.0	10.0	20.0
	2021-10-03	29951.0	5.266068	6.511458	0.0	0.0	1.0	10.0	20.0
	2021-10-04	29951.0	5.212447	6.511711	0.0	0.0	1.0	10.0	20.0
	2021-10-05	29951.0	5.177590	6.512791	0.0	0.0	1.0	10.0	20.0
...	...	...	...	...	...	...	...	...	...
1	2021-11-26	30049.0	10.031216	5.770582	0.0	5.0	10.0	15.0	20.0
	2021-11-27	30049.0	10.026024	5.774141	0.0	5.0	10.0	15.0	20.0
	2021-11-28	30049.0	9.975307	5.788257	0.0	5.0	10.0	15.0	20.0
	2021-11-29	30049.0	9.970781	5.799546	0.0	5.0	10.0	15.0	20.0
	2021-11-30	30049.0	9.963926	5.764812	0.0	5.0	10.0	15.0	20.0

	groupid	activity_level
count	30.0	30.000000
mean	1.0	29302.433333
std	0.0	30.417422
min	1.0	29255.000000
25%	1.0	29280.000000
50%	1.0	29300.000000
75%	1.0	29321.000000
max	1.0	29382.000000

	activity_level
	count	mean	std	min	25%	50%	75%	max
groupid
0	898530.0	5.402211	6.55557	0.0	0.0	1.0	11.0	20.0
1	901470.0	9.996304	5.78868	0.0	5.0	10.0	15.0	20.0

	groupid	ctr
0	0	31.81
1	0	30.46
2	0	34.25
3	0	34.92
4	0	34.95
...	...	...
2303403	1	37.27
2303404	1	39.14
2303405	1	40.05
2303406	1	38.14
2303407	1	37.98